Index: head/contrib/compiler-rt/lib/tsan/rtl/tsan_platform.h =================================================================== --- head/contrib/compiler-rt/lib/tsan/rtl/tsan_platform.h (revision 328752) +++ head/contrib/compiler-rt/lib/tsan/rtl/tsan_platform.h (revision 328753) @@ -1,924 +1,926 @@ //===-- tsan_platform.h -----------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file is a part of ThreadSanitizer (TSan), a race detector. // // Platform-specific code. //===----------------------------------------------------------------------===// #ifndef TSAN_PLATFORM_H #define TSAN_PLATFORM_H #if !defined(__LP64__) && !defined(_WIN64) # error "Only 64-bit is supported" #endif #include "tsan_defs.h" #include "tsan_trace.h" namespace __tsan { #if !SANITIZER_GO #if defined(__x86_64__) /* C/C++ on linux/x86_64 and freebsd/x86_64 0000 0000 1000 - 0080 0000 0000: main binary and/or MAP_32BIT mappings (512GB) 0040 0000 0000 - 0100 0000 0000: - 0100 0000 0000 - 2000 0000 0000: shadow 2000 0000 0000 - 3000 0000 0000: - 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) 4000 0000 0000 - 5500 0000 0000: - 5500 0000 0000 - 5680 0000 0000: pie binaries without ASLR or on 4.1+ kernels 5680 0000 0000 - 6000 0000 0000: - 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 7b00 0000 0000: - 7b00 0000 0000 - 7c00 0000 0000: heap 7c00 0000 0000 - 7e80 0000 0000: - 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack C/C++ on netbsd/amd64 can reuse the same mapping: * The address space starts from 0x1000 (option with 0x0) and ends with 0x7f7ffffff000. * LoAppMem-kHeapMemEnd can be reused as it is. * No VDSO support. * No MidAppMem region. * No additional HeapMem region. * HiAppMem contains the stack, loader, shared libraries and heap. * Stack on NetBSD/amd64 has prereserved 128MB. * Heap grows downwards (top-down). * ASLR must be disabled per-process or globally.
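Worked example (illustrative, not part of the original source; assumes
kShadowCell == 8 and kShadowCnt == 4 from tsan_defs.h): MemToShadow(a),
defined below, computes
  ((a & ~(kAppMemMsk | (kShadowCell - 1))) ^ kAppMemXor) * kShadowCnt,
so the heap address 0x7b0000000000 maps to
  ((0x7b0000000000 & ~0x780000000000) ^ 0x040000000000) * 4 = 0x1c0000000000,
which lies inside the shadow range [0x010000000000, 0x200000000000).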
*/ struct Mapping { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x340000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x010000000000ull; static const uptr kShadowEnd = 0x200000000000ull; static const uptr kHeapMemBeg = 0x7b0000000000ull; static const uptr kHeapMemEnd = 0x7c0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x008000000000ull; static const uptr kMidAppMemBeg = 0x550000000000ull; static const uptr kMidAppMemEnd = 0x568000000000ull; static const uptr kHiAppMemBeg = 0x7e8000000000ull; static const uptr kHiAppMemEnd = 0x800000000000ull; static const uptr kAppMemMsk = 0x780000000000ull; static const uptr kAppMemXor = 0x040000000000ull; static const uptr kVdsoBeg = 0xf000000000000000ull; }; #define TSAN_MID_APP_RANGE 1 #elif defined(__mips64) /* -C/C++ on linux/mips64 -0100 0000 00 - 0200 0000 00: main binary -0200 0000 00 - 1400 0000 00: - -1400 0000 00 - 2400 0000 00: shadow -2400 0000 00 - 3000 0000 00: - -3000 0000 00 - 4000 0000 00: metainfo (memory blocks and sync objects) -4000 0000 00 - 6000 0000 00: - -6000 0000 00 - 6200 0000 00: traces -6200 0000 00 - fe00 0000 00: - -fe00 0000 00 - ff00 0000 00: heap -ff00 0000 00 - ff80 0000 00: - -ff80 0000 00 - ffff ffff ff: modules and main thread stack +C/C++ on linux/mips64 (40-bit VMA) +0000 0000 00 - 0100 0000 00: - (4 GB) +0100 0000 00 - 0200 0000 00: main binary (4 GB) +0200 0000 00 - 2000 0000 00: - (120 GB) +2000 0000 00 - 4000 0000 00: shadow (128 GB) +4000 0000 00 - 5000 0000 00: metainfo (memory blocks and sync objects) (64 GB) +5000 0000 00 - aa00 0000 00: - (360 GB) +aa00 0000 00 - ab00 0000 00: main binary (PIE) (4 GB) +ab00 0000 00 - b000 0000 00: - (20 GB) +b000 0000 00 - b200 0000 00: traces (8 GB) +b200 0000 00 - fe00 0000 00: - (304 GB) +fe00 0000 00 - ff00 0000 00: heap (4 GB) +ff00 0000 00 - ff80 0000 00: - (2 GB) +ff80 0000 00 - ffff ffff ff: modules and main thread stack (<2 GB) */ struct Mapping { static const uptr kMetaShadowBeg = 0x4000000000ull; static const uptr kMetaShadowEnd = 0x5000000000ull; static const uptr kTraceMemBeg = 0xb000000000ull; static const uptr kTraceMemEnd = 0xb200000000ull; - static const uptr kShadowBeg = 0x2400000000ull; + static const uptr kShadowBeg = 0x2000000000ull; static const uptr kShadowEnd = 0x4000000000ull; static const uptr kHeapMemBeg = 0xfe00000000ull; static const uptr kHeapMemEnd = 0xff00000000ull; static const uptr kLoAppMemBeg = 0x0100000000ull; static const uptr kLoAppMemEnd = 0x0200000000ull; static const uptr kMidAppMemBeg = 0xaa00000000ull; static const uptr kMidAppMemEnd = 0xab00000000ull; static const uptr kHiAppMemBeg = 0xff80000000ull; static const uptr kHiAppMemEnd = 0xffffffffffull; static const uptr kAppMemMsk = 0xf800000000ull; static const uptr kAppMemXor = 0x0800000000ull; static const uptr kVdsoBeg = 0xfffff00000ull; }; #define TSAN_MID_APP_RANGE 1 #elif defined(__aarch64__) && defined(__APPLE__) /* C/C++ on Darwin/iOS/ARM64 (36-bit VMA, 64 GB VM) 0000 0000 00 - 0100 0000 00: - (4 GB) 0100 0000 00 - 0200 0000 00: main binary, modules, thread stacks (4 GB) 0200 0000 00 - 0300 0000 00: heap (4 GB) 0300 0000 00 - 0400 0000 00: - (4 GB) 0400 0000 00 - 0c00 0000 00: shadow memory (32 GB) 0c00 0000 00 - 0d00 0000 00: - (4 GB) 0d00 0000 00 - 0e00 0000 00: metainfo (4 GB) 0e00 0000 00 - 0f00 0000 00: - (4 GB) 0f00 0000 00 - 0fc0 0000 00: traces (3 GB) 0fc0 0000 00 - 
1000 0000 00: - */ struct Mapping { static const uptr kLoAppMemBeg = 0x0100000000ull; static const uptr kLoAppMemEnd = 0x0200000000ull; static const uptr kHeapMemBeg = 0x0200000000ull; static const uptr kHeapMemEnd = 0x0300000000ull; static const uptr kShadowBeg = 0x0400000000ull; static const uptr kShadowEnd = 0x0c00000000ull; static const uptr kMetaShadowBeg = 0x0d00000000ull; static const uptr kMetaShadowEnd = 0x0e00000000ull; static const uptr kTraceMemBeg = 0x0f00000000ull; static const uptr kTraceMemEnd = 0x0fc0000000ull; static const uptr kHiAppMemBeg = 0x0fc0000000ull; static const uptr kHiAppMemEnd = 0x0fc0000000ull; static const uptr kAppMemMsk = 0x0ull; static const uptr kAppMemXor = 0x0ull; static const uptr kVdsoBeg = 0x7000000000000000ull; }; #elif defined(__aarch64__) // AArch64 supports multiple VMA sizes, which lead to multiple address // transformation functions. To support these multiple VMA transformations // and mappings, the TSAN runtime for AArch64 uses an external memory read // (vmaSize) to select which mapping to use. Although slower, this lets the // same instrumented binary run on multiple kernels. /* C/C++ on linux/aarch64 (39-bit VMA) 0000 0010 00 - 0100 0000 00: main binary 0100 0000 00 - 0800 0000 00: - 0800 0000 00 - 2000 0000 00: shadow memory 2000 0000 00 - 3100 0000 00: - 3100 0000 00 - 3400 0000 00: metainfo 3400 0000 00 - 5500 0000 00: - 5500 0000 00 - 5600 0000 00: main binary (PIE) 5600 0000 00 - 6000 0000 00: - 6000 0000 00 - 6200 0000 00: traces 6200 0000 00 - 7c00 0000 00: - 7c00 0000 00 - 7d00 0000 00: heap 7d00 0000 00 - 7fff ffff ff: modules and main thread stack */ struct Mapping39 { static const uptr kLoAppMemBeg = 0x0000001000ull; static const uptr kLoAppMemEnd = 0x0100000000ull; static const uptr kShadowBeg = 0x0800000000ull; static const uptr kShadowEnd = 0x2000000000ull; static const uptr kMetaShadowBeg = 0x3100000000ull; static const uptr kMetaShadowEnd = 0x3400000000ull; static const uptr kMidAppMemBeg = 0x5500000000ull; static const uptr kMidAppMemEnd = 0x5600000000ull; static const uptr kTraceMemBeg = 0x6000000000ull; static const uptr kTraceMemEnd = 0x6200000000ull; static const uptr kHeapMemBeg = 0x7c00000000ull; static const uptr kHeapMemEnd = 0x7d00000000ull; static const uptr kHiAppMemBeg = 0x7e00000000ull; static const uptr kHiAppMemEnd = 0x7fffffffffull; static const uptr kAppMemMsk = 0x7800000000ull; static const uptr kAppMemXor = 0x0200000000ull; static const uptr kVdsoBeg = 0x7f00000000ull; }; /* C/C++ on linux/aarch64 (42-bit VMA) 00000 0010 00 - 01000 0000 00: main binary 01000 0000 00 - 10000 0000 00: - 10000 0000 00 - 20000 0000 00: shadow memory 20000 0000 00 - 26000 0000 00: - 26000 0000 00 - 28000 0000 00: metainfo 28000 0000 00 - 2aa00 0000 00: - 2aa00 0000 00 - 2ab00 0000 00: main binary (PIE) 2ab00 0000 00 - 36200 0000 00: - 36200 0000 00 - 36400 0000 00: traces 36400 0000 00 - 3e000 0000 00: - 3e000 0000 00 - 3f000 0000 00: heap 3f000 0000 00 - 3ffff ffff ff: modules and main thread stack */ struct Mapping42 { static const uptr kLoAppMemBeg = 0x00000001000ull; static const uptr kLoAppMemEnd = 0x01000000000ull; static const uptr kShadowBeg = 0x10000000000ull; static const uptr kShadowEnd = 0x20000000000ull; static const uptr kMetaShadowBeg = 0x26000000000ull; static const uptr kMetaShadowEnd = 0x28000000000ull; static const uptr kMidAppMemBeg = 0x2aa00000000ull; static const uptr kMidAppMemEnd = 0x2ab00000000ull; static const uptr kTraceMemBeg = 0x36200000000ull; static const uptr kTraceMemEnd = 0x36400000000ull; static const
uptr kHeapMemBeg = 0x3e000000000ull; static const uptr kHeapMemEnd = 0x3f000000000ull; static const uptr kHiAppMemBeg = 0x3f000000000ull; static const uptr kHiAppMemEnd = 0x3ffffffffffull; static const uptr kAppMemMsk = 0x3c000000000ull; static const uptr kAppMemXor = 0x04000000000ull; static const uptr kVdsoBeg = 0x37f00000000ull; }; struct Mapping48 { static const uptr kLoAppMemBeg = 0x0000000001000ull; static const uptr kLoAppMemEnd = 0x0000200000000ull; static const uptr kShadowBeg = 0x0002000000000ull; static const uptr kShadowEnd = 0x0004000000000ull; static const uptr kMetaShadowBeg = 0x0005000000000ull; static const uptr kMetaShadowEnd = 0x0006000000000ull; static const uptr kMidAppMemBeg = 0x0aaaa00000000ull; static const uptr kMidAppMemEnd = 0x0aaaf00000000ull; static const uptr kTraceMemBeg = 0x0f06000000000ull; static const uptr kTraceMemEnd = 0x0f06200000000ull; static const uptr kHeapMemBeg = 0x0ffff00000000ull; static const uptr kHeapMemEnd = 0x0ffff00000000ull; static const uptr kHiAppMemBeg = 0x0ffff00000000ull; static const uptr kHiAppMemEnd = 0x1000000000000ull; static const uptr kAppMemMsk = 0x0fff800000000ull; static const uptr kAppMemXor = 0x0000800000000ull; static const uptr kVdsoBeg = 0xffff000000000ull; }; // Indicates the runtime will define the memory regions at runtime. #define TSAN_RUNTIME_VMA 1 // Indicates that mapping defines a mid range memory segment. #define TSAN_MID_APP_RANGE 1 #elif defined(__powerpc64__) // PPC64 supports multiple VMA sizes, which lead to multiple address // transformation functions. To support these multiple VMA transformations // and mappings, the TSAN runtime for PPC64 uses an external memory read // (vmaSize) to select which mapping to use. Although slower, this lets the // same instrumented binary run on multiple kernels.
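// Illustrative sketch (not part of the original source): with vmaSize read
// once at startup (presumably in InitializePlatformEarly()), every mapping
// query below reduces to a small switch, e.g. for the shadow base on PPC64:
//
//   switch (vmaSize) {
//     case 44: return MappingImpl<Mapping44, MAPPING_SHADOW_BEG>();
//     case 46: return MappingImpl<Mapping46, MAPPING_SHADOW_BEG>();
//     case 47: return MappingImpl<Mapping47, MAPPING_SHADOW_BEG>();
//   }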
/* C/C++ on linux/powerpc64 (44-bit VMA) 0000 0000 0100 - 0001 0000 0000: main binary 0001 0000 0000 - 0001 0000 0000: - 0001 0000 0000 - 0b00 0000 0000: shadow 0b00 0000 0000 - 0b00 0000 0000: - 0b00 0000 0000 - 0d00 0000 0000: metainfo (memory blocks and sync objects) 0d00 0000 0000 - 0d00 0000 0000: - 0d00 0000 0000 - 0f00 0000 0000: traces 0f00 0000 0000 - 0f00 0000 0000: - 0f00 0000 0000 - 0f50 0000 0000: heap 0f50 0000 0000 - 0f60 0000 0000: - 0f60 0000 0000 - 1000 0000 0000: modules and main thread stack */ struct Mapping44 { static const uptr kMetaShadowBeg = 0x0b0000000000ull; static const uptr kMetaShadowEnd = 0x0d0000000000ull; static const uptr kTraceMemBeg = 0x0d0000000000ull; static const uptr kTraceMemEnd = 0x0f0000000000ull; static const uptr kShadowBeg = 0x000100000000ull; static const uptr kShadowEnd = 0x0b0000000000ull; static const uptr kLoAppMemBeg = 0x000000000100ull; static const uptr kLoAppMemEnd = 0x000100000000ull; static const uptr kHeapMemBeg = 0x0f0000000000ull; static const uptr kHeapMemEnd = 0x0f5000000000ull; static const uptr kHiAppMemBeg = 0x0f6000000000ull; static const uptr kHiAppMemEnd = 0x100000000000ull; // 44 bits static const uptr kAppMemMsk = 0x0f0000000000ull; static const uptr kAppMemXor = 0x002100000000ull; static const uptr kVdsoBeg = 0x3c0000000000000ull; }; /* C/C++ on linux/powerpc64 (46-bit VMA) 0000 0000 1000 - 0100 0000 0000: main binary 0100 0000 0000 - 0200 0000 0000: - 0100 0000 0000 - 1000 0000 0000: shadow 1000 0000 0000 - 1000 0000 0000: - 1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects) 2000 0000 0000 - 2000 0000 0000: - 2000 0000 0000 - 2200 0000 0000: traces 2200 0000 0000 - 3d00 0000 0000: - 3d00 0000 0000 - 3e00 0000 0000: heap 3e00 0000 0000 - 3e80 0000 0000: - 3e80 0000 0000 - 4000 0000 0000: modules and main thread stack */ struct Mapping46 { static const uptr kMetaShadowBeg = 0x100000000000ull; static const uptr kMetaShadowEnd = 0x200000000000ull; static const uptr kTraceMemBeg = 0x200000000000ull; static const uptr kTraceMemEnd = 0x220000000000ull; static const uptr kShadowBeg = 0x010000000000ull; static const uptr kShadowEnd = 0x100000000000ull; static const uptr kHeapMemBeg = 0x3d0000000000ull; static const uptr kHeapMemEnd = 0x3e0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x010000000000ull; static const uptr kHiAppMemBeg = 0x3e8000000000ull; static const uptr kHiAppMemEnd = 0x400000000000ull; // 46 bits static const uptr kAppMemMsk = 0x3c0000000000ull; static const uptr kAppMemXor = 0x020000000000ull; static const uptr kVdsoBeg = 0x7800000000000000ull; }; /* C/C++ on linux/powerpc64 (47-bit VMA) 0000 0000 1000 - 0100 0000 0000: main binary 0100 0000 0000 - 0200 0000 0000: - 0100 0000 0000 - 1000 0000 0000: shadow 1000 0000 0000 - 1000 0000 0000: - 1000 0000 0000 - 2000 0000 0000: metainfo (memory blocks and sync objects) 2000 0000 0000 - 2000 0000 0000: - 2000 0000 0000 - 2200 0000 0000: traces 2200 0000 0000 - 7d00 0000 0000: - 7d00 0000 0000 - 7e00 0000 0000: heap 7e00 0000 0000 - 7e80 0000 0000: - 7e80 0000 0000 - 8000 0000 0000: modules and main thread stack */ struct Mapping47 { static const uptr kMetaShadowBeg = 0x100000000000ull; static const uptr kMetaShadowEnd = 0x200000000000ull; static const uptr kTraceMemBeg = 0x200000000000ull; static const uptr kTraceMemEnd = 0x220000000000ull; static const uptr kShadowBeg = 0x010000000000ull; static const uptr kShadowEnd = 0x100000000000ull; static const uptr kHeapMemBeg = 0x7d0000000000ull; 
static const uptr kHeapMemEnd = 0x7e0000000000ull; static const uptr kLoAppMemBeg = 0x000000001000ull; static const uptr kLoAppMemEnd = 0x010000000000ull; static const uptr kHiAppMemBeg = 0x7e8000000000ull; static const uptr kHiAppMemEnd = 0x800000000000ull; // 47 bits static const uptr kAppMemMsk = 0x7c0000000000ull; static const uptr kAppMemXor = 0x020000000000ull; static const uptr kVdsoBeg = 0x7800000000000000ull; }; // Indicates the runtime will define the memory regions at runtime. #define TSAN_RUNTIME_VMA 1 #endif #elif SANITIZER_GO && !SANITIZER_WINDOWS /* Go on linux, darwin and freebsd 0000 0000 1000 - 0000 1000 0000: executable 0000 1000 0000 - 00c0 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 2000 0000 0000: - 2000 0000 0000 - 2380 0000 0000: shadow 2380 0000 0000 - 3000 0000 0000: - 3000 0000 0000 - 4000 0000 0000: metainfo (memory blocks and sync objects) 4000 0000 0000 - 6000 0000 0000: - 6000 0000 0000 - 6200 0000 0000: traces 6200 0000 0000 - 8000 0000 0000: - */ struct Mapping { static const uptr kMetaShadowBeg = 0x300000000000ull; static const uptr kMetaShadowEnd = 0x400000000000ull; static const uptr kTraceMemBeg = 0x600000000000ull; static const uptr kTraceMemEnd = 0x620000000000ull; static const uptr kShadowBeg = 0x200000000000ull; static const uptr kShadowEnd = 0x238000000000ull; static const uptr kAppMemBeg = 0x000000001000ull; static const uptr kAppMemEnd = 0x00e000000000ull; }; #elif SANITIZER_GO && SANITIZER_WINDOWS /* Go on windows 0000 0000 1000 - 0000 1000 0000: executable 0000 1000 0000 - 00f8 0000 0000: - 00c0 0000 0000 - 00e0 0000 0000: heap 00e0 0000 0000 - 0100 0000 0000: - 0100 0000 0000 - 0500 0000 0000: shadow 0500 0000 0000 - 0560 0000 0000: - 0560 0000 0000 - 0760 0000 0000: traces 0760 0000 0000 - 07d0 0000 0000: metainfo (memory blocks and sync objects) 07d0 0000 0000 - 8000 0000 0000: - */ struct Mapping { static const uptr kMetaShadowBeg = 0x076000000000ull; static const uptr kMetaShadowEnd = 0x07d000000000ull; static const uptr kTraceMemBeg = 0x056000000000ull; static const uptr kTraceMemEnd = 0x076000000000ull; static const uptr kShadowBeg = 0x010000000000ull; static const uptr kShadowEnd = 0x050000000000ull; static const uptr kAppMemBeg = 0x000000001000ull; static const uptr kAppMemEnd = 0x00e000000000ull; }; #else # error "Unknown platform" #endif #ifdef TSAN_RUNTIME_VMA extern uptr vmaSize; #endif enum MappingType { MAPPING_LO_APP_BEG, MAPPING_LO_APP_END, MAPPING_HI_APP_BEG, MAPPING_HI_APP_END, #ifdef TSAN_MID_APP_RANGE MAPPING_MID_APP_BEG, MAPPING_MID_APP_END, #endif MAPPING_HEAP_BEG, MAPPING_HEAP_END, MAPPING_APP_BEG, MAPPING_APP_END, MAPPING_SHADOW_BEG, MAPPING_SHADOW_END, MAPPING_META_SHADOW_BEG, MAPPING_META_SHADOW_END, MAPPING_TRACE_BEG, MAPPING_TRACE_END, MAPPING_VDSO_BEG, }; template <typename Mapping, int Type> uptr MappingImpl(void) { switch (Type) { #if !SANITIZER_GO case MAPPING_LO_APP_BEG: return Mapping::kLoAppMemBeg; case MAPPING_LO_APP_END: return Mapping::kLoAppMemEnd; # ifdef TSAN_MID_APP_RANGE case MAPPING_MID_APP_BEG: return Mapping::kMidAppMemBeg; case MAPPING_MID_APP_END: return Mapping::kMidAppMemEnd; # endif case MAPPING_HI_APP_BEG: return Mapping::kHiAppMemBeg; case MAPPING_HI_APP_END: return Mapping::kHiAppMemEnd; case MAPPING_HEAP_BEG: return Mapping::kHeapMemBeg; case MAPPING_HEAP_END: return Mapping::kHeapMemEnd; case MAPPING_VDSO_BEG: return Mapping::kVdsoBeg; #else case MAPPING_APP_BEG: return Mapping::kAppMemBeg; case MAPPING_APP_END: return Mapping::kAppMemEnd; #endif case MAPPING_SHADOW_BEG: return
Mapping::kShadowBeg; case MAPPING_SHADOW_END: return Mapping::kShadowEnd; case MAPPING_META_SHADOW_BEG: return Mapping::kMetaShadowBeg; case MAPPING_META_SHADOW_END: return Mapping::kMetaShadowEnd; case MAPPING_TRACE_BEG: return Mapping::kTraceMemBeg; case MAPPING_TRACE_END: return Mapping::kTraceMemEnd; } } template <int Type> uptr MappingArchImpl(void) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return MappingImpl<Mapping39, Type>(); case 42: return MappingImpl<Mapping42, Type>(); case 48: return MappingImpl<Mapping48, Type>(); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return MappingImpl<Mapping44, Type>(); case 46: return MappingImpl<Mapping46, Type>(); case 47: return MappingImpl<Mapping47, Type>(); } DCHECK(0); return 0; #else return MappingImpl<Mapping, Type>(); #endif } #if !SANITIZER_GO ALWAYS_INLINE uptr LoAppMemBeg(void) { return MappingArchImpl<MAPPING_LO_APP_BEG>(); } ALWAYS_INLINE uptr LoAppMemEnd(void) { return MappingArchImpl<MAPPING_LO_APP_END>(); } #ifdef TSAN_MID_APP_RANGE ALWAYS_INLINE uptr MidAppMemBeg(void) { return MappingArchImpl<MAPPING_MID_APP_BEG>(); } ALWAYS_INLINE uptr MidAppMemEnd(void) { return MappingArchImpl<MAPPING_MID_APP_END>(); } #endif ALWAYS_INLINE uptr HeapMemBeg(void) { return MappingArchImpl<MAPPING_HEAP_BEG>(); } ALWAYS_INLINE uptr HeapMemEnd(void) { return MappingArchImpl<MAPPING_HEAP_END>(); } ALWAYS_INLINE uptr HiAppMemBeg(void) { return MappingArchImpl<MAPPING_HI_APP_BEG>(); } ALWAYS_INLINE uptr HiAppMemEnd(void) { return MappingArchImpl<MAPPING_HI_APP_END>(); } ALWAYS_INLINE uptr VdsoBeg(void) { return MappingArchImpl<MAPPING_VDSO_BEG>(); } #else ALWAYS_INLINE uptr AppMemBeg(void) { return MappingArchImpl<MAPPING_APP_BEG>(); } ALWAYS_INLINE uptr AppMemEnd(void) { return MappingArchImpl<MAPPING_APP_END>(); } #endif static inline bool GetUserRegion(int i, uptr *start, uptr *end) { switch (i) { default: return false; #if !SANITIZER_GO case 0: *start = LoAppMemBeg(); *end = LoAppMemEnd(); return true; case 1: *start = HiAppMemBeg(); *end = HiAppMemEnd(); return true; case 2: *start = HeapMemBeg(); *end = HeapMemEnd(); return true; # ifdef TSAN_MID_APP_RANGE case 3: *start = MidAppMemBeg(); *end = MidAppMemEnd(); return true; # endif #else case 0: *start = AppMemBeg(); *end = AppMemEnd(); return true; #endif } } ALWAYS_INLINE uptr ShadowBeg(void) { return MappingArchImpl<MAPPING_SHADOW_BEG>(); } ALWAYS_INLINE uptr ShadowEnd(void) { return MappingArchImpl<MAPPING_SHADOW_END>(); } ALWAYS_INLINE uptr MetaShadowBeg(void) { return MappingArchImpl<MAPPING_META_SHADOW_BEG>(); } ALWAYS_INLINE uptr MetaShadowEnd(void) { return MappingArchImpl<MAPPING_META_SHADOW_END>(); } ALWAYS_INLINE uptr TraceMemBeg(void) { return MappingArchImpl<MAPPING_TRACE_BEG>(); } ALWAYS_INLINE uptr TraceMemEnd(void) { return MappingArchImpl<MAPPING_TRACE_END>(); } template <typename Mapping> bool IsAppMemImpl(uptr mem) { #if !SANITIZER_GO return (mem >= Mapping::kHeapMemBeg && mem < Mapping::kHeapMemEnd) || # ifdef TSAN_MID_APP_RANGE (mem >= Mapping::kMidAppMemBeg && mem < Mapping::kMidAppMemEnd) || # endif (mem >= Mapping::kLoAppMemBeg && mem < Mapping::kLoAppMemEnd) || (mem >= Mapping::kHiAppMemBeg && mem < Mapping::kHiAppMemEnd); #else return mem >= Mapping::kAppMemBeg && mem < Mapping::kAppMemEnd; #endif } ALWAYS_INLINE bool IsAppMem(uptr mem) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return IsAppMemImpl<Mapping39>(mem); case 42: return IsAppMemImpl<Mapping42>(mem); case 48: return IsAppMemImpl<Mapping48>(mem); } DCHECK(0); return false; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return IsAppMemImpl<Mapping44>(mem); case 46: return IsAppMemImpl<Mapping46>(mem); case 47: return IsAppMemImpl<Mapping47>(mem); } DCHECK(0); return false; #else return IsAppMemImpl<Mapping>(mem); #endif } template <typename Mapping> bool IsShadowMemImpl(uptr mem) { return mem >= Mapping::kShadowBeg && mem <= Mapping::kShadowEnd; }
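// Illustrative usage sketch (not part of the original source): callers can
// enumerate the application memory regions via the GetUserRegion() helper
// above:
//
//   for (int i = 0; ; i++) {
//     uptr beg, end;
//     if (!GetUserRegion(i, &beg, &end)) break;
//     // [beg, end) is application memory on this platform.
//   }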
ALWAYS_INLINE bool IsShadowMem(uptr mem) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return IsShadowMemImpl<Mapping39>(mem); case 42: return IsShadowMemImpl<Mapping42>(mem); case 48: return IsShadowMemImpl<Mapping48>(mem); } DCHECK(0); return false; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return IsShadowMemImpl<Mapping44>(mem); case 46: return IsShadowMemImpl<Mapping46>(mem); case 47: return IsShadowMemImpl<Mapping47>(mem); } DCHECK(0); return false; #else return IsShadowMemImpl<Mapping>(mem); #endif } template <typename Mapping> bool IsMetaMemImpl(uptr mem) { return mem >= Mapping::kMetaShadowBeg && mem <= Mapping::kMetaShadowEnd; } ALWAYS_INLINE bool IsMetaMem(uptr mem) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return IsMetaMemImpl<Mapping39>(mem); case 42: return IsMetaMemImpl<Mapping42>(mem); case 48: return IsMetaMemImpl<Mapping48>(mem); } DCHECK(0); return false; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return IsMetaMemImpl<Mapping44>(mem); case 46: return IsMetaMemImpl<Mapping46>(mem); case 47: return IsMetaMemImpl<Mapping47>(mem); } DCHECK(0); return false; #else return IsMetaMemImpl<Mapping>(mem); #endif } template <typename Mapping> uptr MemToShadowImpl(uptr x) { DCHECK(IsAppMem(x)); #if !SANITIZER_GO return (((x) & ~(Mapping::kAppMemMsk | (kShadowCell - 1))) ^ Mapping::kAppMemXor) * kShadowCnt; #else # ifndef SANITIZER_WINDOWS return ((x & ~(kShadowCell - 1)) * kShadowCnt) | Mapping::kShadowBeg; # else return ((x & ~(kShadowCell - 1)) * kShadowCnt) + Mapping::kShadowBeg; # endif #endif } ALWAYS_INLINE uptr MemToShadow(uptr x) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return MemToShadowImpl<Mapping39>(x); case 42: return MemToShadowImpl<Mapping42>(x); case 48: return MemToShadowImpl<Mapping48>(x); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return MemToShadowImpl<Mapping44>(x); case 46: return MemToShadowImpl<Mapping46>(x); case 47: return MemToShadowImpl<Mapping47>(x); } DCHECK(0); return 0; #else return MemToShadowImpl<Mapping>(x); #endif } template <typename Mapping> u32 *MemToMetaImpl(uptr x) { DCHECK(IsAppMem(x)); #if !SANITIZER_GO return (u32*)(((((x) & ~(Mapping::kAppMemMsk | (kMetaShadowCell - 1)))) / kMetaShadowCell * kMetaShadowSize) | Mapping::kMetaShadowBeg); #else # ifndef SANITIZER_WINDOWS return (u32*)(((x & ~(kMetaShadowCell - 1)) / \ kMetaShadowCell * kMetaShadowSize) | Mapping::kMetaShadowBeg); # else return (u32*)(((x & ~(kMetaShadowCell - 1)) / \ kMetaShadowCell * kMetaShadowSize) + Mapping::kMetaShadowBeg); # endif #endif } ALWAYS_INLINE u32 *MemToMeta(uptr x) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return MemToMetaImpl<Mapping39>(x); case 42: return MemToMetaImpl<Mapping42>(x); case 48: return MemToMetaImpl<Mapping48>(x); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return MemToMetaImpl<Mapping44>(x); case 46: return MemToMetaImpl<Mapping46>(x); case 47: return MemToMetaImpl<Mapping47>(x); } DCHECK(0); return 0; #else return MemToMetaImpl<Mapping>(x); #endif } template <typename Mapping> uptr ShadowToMemImpl(uptr s) { DCHECK(IsShadowMem(s)); #if !SANITIZER_GO // The shadow mapping is non-linear and we've lost some bits, so we don't have // an easy way to restore the original app address. But the mapping is a // bijection, so we try to restore the address as belonging to low/mid/high // range consecutively and see if shadow->app->shadow mapping gives us the // same address.
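// Illustrative note (not part of the original source): MemToShadow()
// discards the address bits in kAppMemMsk, so several raw addresses fold to
// the same shadow cell. The candidates computed below undo only the
// arithmetic part of the mapping, and the MemToShadow(p) == s round-trip
// check is what identifies which app range the shadow address came from.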
uptr p = (s / kShadowCnt) ^ Mapping::kAppMemXor; if (p >= Mapping::kLoAppMemBeg && p < Mapping::kLoAppMemEnd && MemToShadow(p) == s) return p; # ifdef TSAN_MID_APP_RANGE p = ((s / kShadowCnt) ^ Mapping::kAppMemXor) + (Mapping::kMidAppMemBeg & Mapping::kAppMemMsk); if (p >= Mapping::kMidAppMemBeg && p < Mapping::kMidAppMemEnd && MemToShadow(p) == s) return p; # endif return ((s / kShadowCnt) ^ Mapping::kAppMemXor) | Mapping::kAppMemMsk; #else // #if !SANITIZER_GO # ifndef SANITIZER_WINDOWS return (s & ~Mapping::kShadowBeg) / kShadowCnt; # else return (s - Mapping::kShadowBeg) / kShadowCnt; # endif // SANITIZER_WINDOWS #endif } ALWAYS_INLINE uptr ShadowToMem(uptr s) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return ShadowToMemImpl<Mapping39>(s); case 42: return ShadowToMemImpl<Mapping42>(s); case 48: return ShadowToMemImpl<Mapping48>(s); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return ShadowToMemImpl<Mapping44>(s); case 46: return ShadowToMemImpl<Mapping46>(s); case 47: return ShadowToMemImpl<Mapping47>(s); } DCHECK(0); return 0; #else return ShadowToMemImpl<Mapping>(s); #endif } // The additional page is to catch shadow stack overflow as paging fault. // Windows wants 64K alignment for mmaps. const uptr kTotalTraceSize = (kTraceSize * sizeof(Event) + sizeof(Trace) + (64 << 10) + (64 << 10) - 1) & ~((64 << 10) - 1); template <typename Mapping> uptr GetThreadTraceImpl(int tid) { uptr p = Mapping::kTraceMemBeg + (uptr)tid * kTotalTraceSize; DCHECK_LT(p, Mapping::kTraceMemEnd); return p; } ALWAYS_INLINE uptr GetThreadTrace(int tid) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return GetThreadTraceImpl<Mapping39>(tid); case 42: return GetThreadTraceImpl<Mapping42>(tid); case 48: return GetThreadTraceImpl<Mapping48>(tid); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return GetThreadTraceImpl<Mapping44>(tid); case 46: return GetThreadTraceImpl<Mapping46>(tid); case 47: return GetThreadTraceImpl<Mapping47>(tid); } DCHECK(0); return 0; #else return GetThreadTraceImpl<Mapping>(tid); #endif } template <typename Mapping> uptr GetThreadTraceHeaderImpl(int tid) { uptr p = Mapping::kTraceMemBeg + (uptr)tid * kTotalTraceSize + kTraceSize * sizeof(Event); DCHECK_LT(p, Mapping::kTraceMemEnd); return p; } ALWAYS_INLINE uptr GetThreadTraceHeader(int tid) { #if defined(__aarch64__) && !defined(__APPLE__) switch (vmaSize) { case 39: return GetThreadTraceHeaderImpl<Mapping39>(tid); case 42: return GetThreadTraceHeaderImpl<Mapping42>(tid); case 48: return GetThreadTraceHeaderImpl<Mapping48>(tid); } DCHECK(0); return 0; #elif defined(__powerpc64__) switch (vmaSize) { case 44: return GetThreadTraceHeaderImpl<Mapping44>(tid); case 46: return GetThreadTraceHeaderImpl<Mapping46>(tid); case 47: return GetThreadTraceHeaderImpl<Mapping47>(tid); } DCHECK(0); return 0; #else return GetThreadTraceHeaderImpl<Mapping>(tid); #endif } void InitializePlatform(); void InitializePlatformEarly(); void CheckAndProtect(); void InitializeShadowMemoryPlatform(); void FlushShadowMemory(); void WriteMemoryProfile(char *buf, uptr buf_size, uptr nthread, uptr nlive); int ExtractResolvFDs(void *state, int *fds, int nfd); int ExtractRecvmsgFDs(void *msg, int *fds, int nfd); void ImitateTlsWrite(ThreadState *thr, uptr tls_addr, uptr tls_size); int call_pthread_cancel_with_cleanup(int(*fn)(void *c, void *m, void *abstime), void *c, void *m, void *abstime, void(*cleanup)(void *arg), void *arg); void DestroyThreadState(); } // namespace __tsan #endif // TSAN_PLATFORM_H Index: head/contrib/compiler-rt =================================================================== --- head/contrib/compiler-rt (revision 328752) +++ head/contrib/compiler-rt (revision 328753)
Property changes on: head/contrib/compiler-rt ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/compiler-rt/dist-release_60:r328374-328750 Index: head/contrib/libc++ =================================================================== --- head/contrib/libc++ (revision 328752) +++ head/contrib/libc++ (revision 328753) Property changes on: head/contrib/libc++ ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/libc++/dist-release_60:r328374-328750 Index: head/contrib/llvm/include/llvm/Analysis/ValueTracking.h =================================================================== --- head/contrib/llvm/include/llvm/Analysis/ValueTracking.h (revision 328752) +++ head/contrib/llvm/include/llvm/Analysis/ValueTracking.h (revision 328753) @@ -1,538 +1,539 @@ //===- llvm/Analysis/ValueTracking.h - Walk computations --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains routines that help analyze properties that chains of // computations have. // //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_VALUETRACKING_H #define LLVM_ANALYSIS_VALUETRACKING_H #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/Optional.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Intrinsics.h" #include <cassert> #include <cstdint> namespace llvm { class AddOperator; class APInt; class AssumptionCache; class DataLayout; class DominatorTree; class GEPOperator; class IntrinsicInst; struct KnownBits; class Loop; class LoopInfo; class MDNode; class OptimizationRemarkEmitter; class StringRef; class TargetLibraryInfo; class Value; /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, the known zero and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr, OptimizationRemarkEmitter *ORE = nullptr); /// Returns the known bits rather than passing by reference. KnownBits computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr, OptimizationRemarkEmitter *ORE = nullptr); /// Compute known bits from the range metadata. /// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one void computeKnownBitsFromRangeMetadata(const MDNode &Ranges, KnownBits &Known); /// Return true if LHS and RHS have no common bits set. bool haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Return true if the given value is known to have exactly one bit set when /// defined.
For vectors, return true if every element is known to be a power /// of two when defined. Supports values with integer or pointer type and /// vectors of integers. If 'OrZero' is set, then return true if the given /// value is either a power of two or zero. bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero = false, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. bool isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Returns true if the given value is known to be non-negative. bool isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Returns true if the given value is known to be positive (i.e. non-negative /// and non-zero). bool isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Returns true if the given value is known to be negative (i.e. non-positive /// and non-zero). bool isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Return true if the given values are known to be non-equal when defined. /// Supports scalar integer types only. bool isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// Return true if 'V & Mask' is known to be zero. We use this predicate to /// simplify operations downstream. Mask is known to be zero for bits that V /// cannot have. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case /// where V is a vector, the mask, known zero, and known one values are the /// same width as the vector element, and the bit is set only if it is true /// for all of the elements in the vector. bool MaskedValueIsZero(const Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr);
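// Illustrative usage sketch (not part of the original header), where DL is
// the module's DataLayout:
//
//   KnownBits Known = computeKnownBits(V, DL);
//   if (Known.Zero[0]) { /* low bit known zero: V is even */ }
//   if (MaskedValueIsZero(V, APInt::getHighBitsSet(32, 16), DL)) {
//     /* the top 16 bits of the 32-bit value V are known zero */
//   }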
/// Return the number of times the sign bit of the register is replicated into /// the other bits. We know that at least 1 bit is always equal to the sign /// bit (itself), but other cases can give us information. For example, /// immediately after an "ashr X, 2", we know that the top 3 bits are all /// equal to each other, so we return 3. For vectors, return the number of /// sign bits for the vector element with the minimum number of known sign /// bits. unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// This function computes the integer multiple of Base that equals V. If /// successful, it returns true and returns the multiple in Multiple. If /// unsuccessful, it returns false. Also, if V can be simplified to an /// integer, then the simplified V is returned in Val. Look through sext only /// if LookThroughSExt=true. bool ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, bool LookThroughSExt = false, unsigned Depth = 0); /// Map a call instruction to an intrinsic ID. Libcalls which have equivalent /// intrinsics are treated as-if they were intrinsics. Intrinsic::ID getIntrinsicForCallSite(ImmutableCallSite ICS, const TargetLibraryInfo *TLI); /// Return true if we can prove that the specified FP value is never equal to /// -0.0. bool CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, unsigned Depth = 0); /// Return true if we can prove that the specified FP value is either NaN or /// never less than -0.0. /// /// NaN --> true /// +0 --> true /// -0 --> true /// x > +0 --> true /// x < -0 --> false bool CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI); /// Return true if the floating-point scalar value is not a NaN or if the /// floating-point vector value has no NaN elements. Return false if a value /// could ever be NaN. bool isKnownNeverNaN(const Value *V); /// Return true if we can prove that the specified FP value's sign bit is 0. /// /// NaN --> true/false (depending on the NaN's sign bit) /// +0 --> true /// -0 --> false /// x > +0 --> true /// x < -0 --> false bool SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI); /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is true for all i8 /// values obviously, but is also true for i32 0, i32 -1, i16 0xF0F0, double /// 0.0 etc. If the value can't be handled with a repeated byte store (e.g. /// i16 0x1234), return null. Value *isBytewiseValue(Value *V); /// Given an aggregate and a sequence of indices, see if the scalar value /// indexed is already around as a register, for example if it were inserted /// directly into the aggregate. /// /// If InsertBefore is not null, this function will duplicate (modified) /// insertvalues when a part of a nested struct is extracted. Value *FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range, Instruction *InsertBefore = nullptr); /// Analyze the specified pointer to see if it can be expressed as a base /// pointer plus a constant offset. Return the base and offset to the caller. Value *GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL); inline const Value *GetPointerBaseWithConstantOffset(const Value *Ptr, int64_t &Offset, const DataLayout &DL) { return GetPointerBaseWithConstantOffset(const_cast<Value *>(Ptr), Offset, DL); } /// Returns true if the GEP is based on a pointer to a string (array of /// \p CharSize integers) and is indexing into this string. bool isGEPBasedOnPointerToString(const GEPOperator *GEP, unsigned CharSize = 8); /// Represents offset+length into a ConstantDataArray. struct ConstantDataArraySlice { /// ConstantDataArray pointer. nullptr indicates a zeroinitializer (a valid /// initializer, it just doesn't fit the ConstantDataArray interface). const ConstantDataArray *Array; /// Slice starts at this Offset.
uint64_t Offset; /// Length of the slice. uint64_t Length; /// Moves the Offset and adjusts Length accordingly. void move(uint64_t Delta) { assert(Delta < Length); Offset += Delta; Length -= Delta; } /// Convenience accessor for elements in the slice. uint64_t operator[](unsigned I) const { return Array==nullptr ? 0 : Array->getElementAsInteger(I + Offset); } }; /// Returns true if the value \p V is a pointer into a ConstantDataArray. /// If successful, \p Slice will point to a ConstantDataArray info object /// with an appropriate offset. bool getConstantDataArrayInfo(const Value *V, ConstantDataArraySlice &Slice, unsigned ElementSize, uint64_t Offset = 0); /// This function computes the length of a null-terminated C string pointed to /// by V. If successful, it returns true and returns the string in Str. If /// unsuccessful, it returns false. This does not include the trailing null /// character by default. If TrimAtNul is set to false, then this returns any /// trailing null characters as well as any other characters that come after /// it. bool getConstantStringInfo(const Value *V, StringRef &Str, uint64_t Offset = 0, bool TrimAtNul = true); /// If we can compute the length of the string pointed to by the specified /// pointer, return 'len+1'. If we can't, return 0. uint64_t GetStringLength(const Value *V, unsigned CharSize = 8); /// This method strips off any GEP address adjustments and pointer casts from /// the specified value, returning the original object being addressed. Note /// that the returned value has pointer type if the specified value does. If /// the MaxLookup value is non-zero, it limits the number of instructions to /// be stripped off. Value *GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup = 6); inline const Value *GetUnderlyingObject(const Value *V, const DataLayout &DL, unsigned MaxLookup = 6) { return GetUnderlyingObject(const_cast<Value *>(V), DL, MaxLookup); } /// \brief This method is similar to GetUnderlyingObject except that it can /// look through phi and select instructions and return multiple objects. /// /// If LoopInfo is passed, loop phis are further analyzed. If a pointer /// accesses different objects in each iteration, we don't look through the /// phi node. E.g. consider this loop nest: /// /// int **A; /// for (i) /// for (j) { /// A[i][j] = A[i-1][j] * B[j] /// } /// /// This is transformed by Load-PRE to stash away A[i] for the next iteration /// of the outer loop: /// /// Curr = A[0]; // Prev_0 /// for (i: 1..N) { /// Prev = Curr; // Prev = PHI (Prev_0, Curr) /// Curr = A[i]; /// for (j: 0..N) { /// Curr[j] = Prev[j] * B[j] /// } /// } /// /// Since A[i] and A[i-1] are independent pointers, getUnderlyingObjects /// should not assume that Curr and Prev share the same underlying object, /// thus it shouldn't look through the phi above. void GetUnderlyingObjects(Value *V, SmallVectorImpl<Value *> &Objects, const DataLayout &DL, LoopInfo *LI = nullptr, unsigned MaxLookup = 6); /// This is a wrapper around GetUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. bool getUnderlyingObjectsForCodeGen(const Value *V, SmallVectorImpl<Value *> &Objects, const DataLayout &DL); /// Return true if the only users of this pointer are lifetime markers. bool onlyUsedByLifetimeMarkers(const Value *V);
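// Illustrative usage sketch (not part of the original header):
//
//   int64_t Offset = 0;
//   Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, DL);
//   // For %p = getelementptr i8, i8* %q, i64 16: Base is %q, Offset is 16.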
/// Return true if the instruction does not have any effects besides /// calculating the result and does not have undefined behavior. /// /// This method never returns true for an instruction that returns true for /// mayHaveSideEffects; however, this method also does some other checks in /// addition. It checks for undefined behavior, like dividing by zero or /// loading from an invalid pointer (but not for undefined results, like a /// shift with a shift amount larger than the width of the result). It checks /// for malloc and alloca because speculatively executing them might cause a /// memory leak. It also returns false for instructions related to control /// flow, specifically terminators and PHI nodes. /// /// If the CtxI is specified this method performs context-sensitive analysis /// and returns true if it is safe to execute the instruction immediately /// before the CtxI. /// /// If the CtxI is NOT specified this method only looks at the instruction /// itself and its operands, so if this method returns true, it is safe to /// move the instruction as long as the correct dominance relationships for /// the operands and users hold. /// /// This method can return true for instructions that read memory; /// for such instructions, moving them may change the resulting value. bool isSafeToSpeculativelyExecute(const Value *V, const Instruction *CtxI = nullptr, const DominatorTree *DT = nullptr); /// Returns true if the result or effects of the given instruction \p I /// depend on or influence global memory. /// Memory dependence arises for example if the instruction reads from /// memory or may produce effects or undefined behaviour. Memory dependent /// instructions generally cannot be reordered with respect to other memory /// dependent instructions or moved into non-dominated basic blocks. /// Instructions which just compute a value based on the values of their /// operands are not memory dependent. bool mayBeMemoryDependent(const Instruction &I); /// Return true if it is an intrinsic that cannot be speculated but also /// cannot trap. bool isAssumeLikeIntrinsic(const Instruction *I); /// Return true if it is valid to use the assumptions provided by an /// assume intrinsic, I, at the point in the control-flow identified by the /// context instruction, CxtI. bool isValidAssumeForContext(const Instruction *I, const Instruction *CxtI, const DominatorTree *DT = nullptr); enum class OverflowResult { AlwaysOverflows, MayOverflow, NeverOverflows }; OverflowResult computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT); OverflowResult computeOverflowForUnsignedAdd(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT); OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); /// This version also leverages the sign bit of Add if known. OverflowResult computeOverflowForSignedAdd(const AddOperator *Add, const DataLayout &DL, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr);
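// Illustrative usage sketch (not part of the original header): proving that
// an add can safely be marked 'nuw':
//
//   OverflowResult OR =
//       computeOverflowForUnsignedAdd(A, B, DL, &AC, CxtI, &DT);
//   if (OR == OverflowResult::NeverOverflows) {
//     /* A + B cannot wrap; the add may be given the nuw flag */
//   }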
/// Returns true if the arithmetic part of the \p II's result is /// used only along the paths control dependent on the computation /// not overflowing, \p II being an .with.overflow intrinsic. bool isOverflowIntrinsicNoWrap(const IntrinsicInst *II, const DominatorTree &DT); /// Return true if this function can prove that the instruction I will /// always transfer execution to one of its successors (including the next /// instruction that follows within a basic block). E.g. this is not /// guaranteed for function calls that could loop infinitely. /// /// In other words, this function returns false for instructions that may /// transfer execution or fail to transfer execution in a way that is not /// captured in the CFG nor in the sequence of instructions within a basic /// block. /// /// Undefined behavior is assumed not to happen, so e.g. division is /// guaranteed to transfer execution to the following instruction even /// though division by zero might cause undefined behavior. bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I); /// Return true if this function can prove that the instruction I /// is executed for every iteration of the loop L. /// /// Note that this currently only considers the loop header. bool isGuaranteedToExecuteForEveryIteration(const Instruction *I, const Loop *L); /// Return true if this function can prove that I is guaranteed to yield /// full-poison (all bits poison) if at least one of its operands is /// full-poison (all bits poison). /// /// The exact rules for how poison propagates through instructions have /// not been settled as of 2015-07-10, so this function is conservative /// and only considers poison to be propagated in uncontroversial /// cases. There is no attempt to track values that may be only partially /// poison. bool propagatesFullPoison(const Instruction *I); /// Return either nullptr or an operand of I such that I will trigger /// undefined behavior if I is executed and that operand has a full-poison /// value (all bits poison). const Value *getGuaranteedNonFullPoisonOp(const Instruction *I); /// Return true if this function can prove that if PoisonI is executed /// and yields a full-poison value (all bits poison), then that will /// trigger undefined behavior. /// /// Note that this currently only considers the basic block that is /// the parent of I. bool programUndefinedIfFullPoison(const Instruction *PoisonI); /// \brief Specific patterns of select instructions we can match. enum SelectPatternFlavor { SPF_UNKNOWN = 0, SPF_SMIN, /// Signed minimum SPF_UMIN, /// Unsigned minimum SPF_SMAX, /// Signed maximum SPF_UMAX, /// Unsigned maximum SPF_FMINNUM, /// Floating point minnum SPF_FMAXNUM, /// Floating point maxnum SPF_ABS, /// Absolute value SPF_NABS /// Negated absolute value }; /// \brief Behavior when a floating point min/max is given one NaN and one /// non-NaN as input. enum SelectPatternNaNBehavior { SPNB_NA = 0, /// NaN behavior not applicable. SPNB_RETURNS_NAN, /// Given one NaN input, returns the NaN. SPNB_RETURNS_OTHER, /// Given one NaN input, returns the non-NaN. SPNB_RETURNS_ANY /// Given one NaN input, can return either (or /// it has been determined that no operands can /// be NaN). }; struct SelectPatternResult { SelectPatternFlavor Flavor; SelectPatternNaNBehavior NaNBehavior; /// Only applicable if Flavor is /// SPF_FMINNUM or SPF_FMAXNUM. bool Ordered; /// When implementing this min/max pattern as /// fcmp; select, does the fcmp have to be /// ordered? /// \brief Return true if \p SPF is a min or a max pattern.
static bool isMinOrMax(SelectPatternFlavor SPF) { return !(SPF == SPF_UNKNOWN || SPF == SPF_ABS || SPF == SPF_NABS); } }; /// Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind /// and providing the out parameter results if we successfully match. /// /// If CastOp is not nullptr, also match MIN/MAX idioms where the type does /// not match that of the original select. If this is the case, the cast /// operation (one of Trunc,SExt,Zext) that must be done to transform the /// type of LHS and RHS into the type of V is returned in CastOp. /// /// For example: /// %1 = icmp slt i32 %a, i32 4 /// %2 = sext i32 %a to i64 /// %3 = select i1 %1, i64 %2, i64 4 /// /// -> LHS = %a, RHS = i32 4, *CastOp = Instruction::SExt /// SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, - Instruction::CastOps *CastOp = nullptr); + Instruction::CastOps *CastOp = nullptr, + unsigned Depth = 0); inline SelectPatternResult matchSelectPattern(const Value *V, const Value *&LHS, const Value *&RHS, Instruction::CastOps *CastOp = nullptr) { Value *L = const_cast<Value *>(LHS); Value *R = const_cast<Value *>(RHS); auto Result = matchSelectPattern(const_cast<Value *>(V), L, R); LHS = L; RHS = R; return Result; } /// Return true if RHS is known to be implied true by LHS. Return false if /// RHS is known to be implied false by LHS. Otherwise, return None if no /// implication can be made. /// A & B must be i1 (boolean) values or a vector of such values. Note that /// the truth table for implication is the same as <=u on i1 values (but not /// <=s!). The truth table for both is: /// | T | F (B) /// T | T | F /// F | T | T /// (A) Optional<bool> isImpliedCondition(const Value *LHS, const Value *RHS, const DataLayout &DL, bool LHSIsTrue = true, unsigned Depth = 0); } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H
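// Illustrative usage sketch (not part of the original header): with i1
// values A = (x <u 10) and B = (x <u 20):
//
//   Optional<bool> R1 = isImpliedCondition(A, B, DL); // true: A implies B
//   Optional<bool> R2 = isImpliedCondition(B, A, DL); // None: no implication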
Index: head/contrib/llvm/lib/Analysis/ValueTracking.cpp =================================================================== --- head/contrib/llvm/lib/Analysis/ValueTracking.cpp (revision 328752) +++ head/contrib/llvm/lib/Analysis/ValueTracking.cpp (revision 328753) @@ -1,4856 +1,4863 @@ //===- ValueTracking.cpp - Walk computations to compute properties --------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains routines that help analyze properties that chains of // computations have. // //===----------------------------------------------------------------------===// #include "llvm/Analysis/ValueTracking.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include <algorithm> #include <array> #include <cassert> #include <cstdint> #include <iterator> #include <utility> using namespace llvm; using namespace llvm::PatternMatch; const unsigned MaxDepth = 6; // Controls the number of uses of the value searched for possible // dominating comparisons. static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses", cl::Hidden, cl::init(20)); /// Returns the bitwidth of the given scalar or pointer type. For vector types, /// returns the element type's bitwidth. static unsigned getBitWidth(Type *Ty, const DataLayout &DL) { if (unsigned BitWidth = Ty->getScalarSizeInBits()) return BitWidth; return DL.getPointerTypeSizeInBits(Ty); } namespace { // Simplifying using an assume can only be done in a particular control-flow // context (the context instruction provides that context). If an assume and // the context instruction are not in the same block then the DT helps in // figuring out if we can use it. struct Query { const DataLayout &DL; AssumptionCache *AC; const Instruction *CxtI; const DominatorTree *DT; // Unlike the other analyses, this may be a nullptr because not all clients // provide it currently. OptimizationRemarkEmitter *ORE; /// Set of assumptions that should be excluded from further queries. /// This is because of the potential for mutual recursion to cause /// computeKnownBits to repeatedly visit the same assume intrinsic. The /// classic case of this is assume(x = y), which will attempt to determine /// bits in x from bits in y, which will attempt to determine bits in y from /// bits in x, etc.
Regarding the mutual recursion, computeKnownBits can call /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo /// (all of which can call computeKnownBits), and so on. std::array<const Value *, 64> Excluded; unsigned NumExcluded = 0; Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr) : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE) {} Query(const Query &Q, const Value *NewExcl) : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE), NumExcluded(Q.NumExcluded) { Excluded = Q.Excluded; Excluded[NumExcluded++] = NewExcl; assert(NumExcluded <= Excluded.size()); } bool isExcluded(const Value *Value) const { if (NumExcluded == 0) return false; auto End = Excluded.begin() + NumExcluded; return std::find(Excluded.begin(), End, Value) != End; } }; } // end anonymous namespace // Given the provided Value and, potentially, a context instruction, return // the preferred context instruction (if any). static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) { // If we've been provided with a context instruction, then use that (provided // it has been inserted). if (CxtI && CxtI->getParent()) return CxtI; // If the value is really an already-inserted instruction, then use that. CxtI = dyn_cast<Instruction>(V); if (CxtI && CxtI->getParent()) return CxtI; return nullptr; } static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q); void llvm::computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE) { ::computeKnownBits(V, Known, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } static KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q); KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT, OptimizationRemarkEmitter *ORE) { return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { assert(LHS->getType() == RHS->getType() && "LHS and RHS should have the same type"); assert(LHS->getType()->isIntOrIntVectorTy() && "LHS and RHS should be integers"); IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType()); KnownBits LHSKnown(IT->getBitWidth()); KnownBits RHSKnown(IT->getBitWidth()); computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT); computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT); return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue(); } bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) { for (const User *U : CxtI->users()) { if (const ICmpInst *IC = dyn_cast<ICmpInst>(U)) if (IC->isEquality()) if (Constant *C = dyn_cast<Constant>(IC->getOperand(1))) if (C->isNullValue()) continue; return false; } return true; } static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q);
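// Illustrative sketch (not part of the original source) of the exclusion
// mechanism described above:
//
//   Query Q(DL, AC, CxtI, DT);
//   Query QSub(Q, /*NewExcl=*/AssumeCall); // exclude this llvm.assume
//   QSub.isExcluded(AssumeCall);           // true: it will not be revisited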
AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); return Known.isNonNegative(); } bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { if (auto *CI = dyn_cast<ConstantInt>(V)) return CI->getValue().isStrictlyPositive(); // TODO: We're doing two recursive queries here. We should factor this such // that only a single query is needed. return isKnownNonNegative(V, DL, Depth, AC, CxtI, DT) && isKnownNonZero(V, DL, Depth, AC, CxtI, DT); } bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); return Known.isNegative(); } static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q); bool llvm::isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::isKnownNonEqual(V1, V2, Query(DL, AC, safeCxtI(V1, safeCxtI(V2, CxtI)), DT)); } static bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth, const Query &Q); bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::MaskedValueIsZero(V, Mask, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q); unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); } static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, bool NSW, KnownBits &KnownOut, KnownBits &Known2, unsigned Depth, const Query &Q) { unsigned BitWidth = KnownOut.getBitWidth(); // If an initial sequence of bits in the result is not needed, the // corresponding bits in the operands are not needed. KnownBits LHSKnown(BitWidth); computeKnownBits(Op0, LHSKnown, Depth + 1, Q); computeKnownBits(Op1, Known2, Depth + 1, Q); KnownOut = KnownBits::computeForAddSub(Add, NSW, LHSKnown, Known2); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, KnownBits &Known, KnownBits &Known2, unsigned Depth, const Query &Q) { unsigned BitWidth = Known.getBitWidth(); computeKnownBits(Op1, Known, Depth + 1, Q); computeKnownBits(Op0, Known2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; // If the multiplication is known not to overflow, compute the sign bit. if (NSW) { if (Op0 == Op1) { // The product of a number with itself is non-negative. isKnownNonNegative = true; } else { bool isKnownNonNegativeOp1 = Known.isNonNegative(); bool isKnownNonNegativeOp0 = Known2.isNonNegative(); bool isKnownNegativeOp1 = Known.isNegative(); bool isKnownNegativeOp0 = Known2.isNegative(); // The product of two numbers with the same sign is non-negative.
isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) || (isKnownNonNegativeOp1 && isKnownNonNegativeOp0); // The product of a negative number and a non-negative number is either // negative or zero. if (!isKnownNonNegative) isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 && isKnownNonZero(Op0, Depth, Q)) || (isKnownNegativeOp0 && isKnownNonNegativeOp1 && isKnownNonZero(Op1, Depth, Q)); } } assert(!Known.hasConflict() && !Known2.hasConflict()); // Compute a conservative estimate for high known-0 bits. unsigned LeadZ = std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; LeadZ = std::min(LeadZ, BitWidth); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and // multiplying them together. // We can infer at least the minimum number of known trailing bits // of both operands. Depending on number of trailing zeros, we can // infer more bits, because (a*b) <=> ((a/m) * (b/n)) * (m*n) assuming // a and b are divisible by m and n respectively. // We then calculate how many of those bits are inferrable and set // the output. For example, the i8 mul: // a = XXXX1100 (12) // b = XXXX1110 (14) // We know the bottom 3 bits are zero since the first can be divided by // 4 and the second by 2, thus having ((12/4) * (14/2)) * (2*4). // Applying the multiplication to the trimmed arguments gets: // XX11 (3) // X111 (7) // ------- // XX11 // XX11 // XX11 // XX11 // ------- // XXXXX01 // Which allows us to infer the 2 LSBs. Since we're multiplying the result // by 8, the bottom 3 bits will be 0, so we can infer a total of 5 bits. // The proof for this can be described as: // Pre: (C1 >= 0) && (C1 < (1 << C5)) && (C2 >= 0) && (C2 < (1 << C6)) && // (C7 == (1 << (umin(countTrailingZeros(C1), C5) + // umin(countTrailingZeros(C2), C6) + // umin(C5 - umin(countTrailingZeros(C1), C5), // C6 - umin(countTrailingZeros(C2), C6)))) - 1) // %aa = shl i8 %a, C5 // %bb = shl i8 %b, C6 // %aaa = or i8 %aa, C1 // %bbb = or i8 %bb, C2 // %mul = mul i8 %aaa, %bbb // %mask = and i8 %mul, C7 // => // %mask = i8 ((C1*C2)&C7) // Where C5, C6 describe the known bits of %a, %b // C1, C2 describe the known bottom bits of %a, %b. // C7 describes the mask of the known bits of the result. APInt Bottom0 = Known.One; APInt Bottom1 = Known2.One; // How many times we'd be able to divide each argument by 2 (shr by 1). // This gives us the number of trailing zeros on the multiplication result. unsigned TrailBitsKnown0 = (Known.Zero | Known.One).countTrailingOnes(); unsigned TrailBitsKnown1 = (Known2.Zero | Known2.One).countTrailingOnes(); unsigned TrailZero0 = Known.countMinTrailingZeros(); unsigned TrailZero1 = Known2.countMinTrailingZeros(); unsigned TrailZ = TrailZero0 + TrailZero1; // Figure out the fewest known-bits operand. unsigned SmallestOperand = std::min(TrailBitsKnown0 - TrailZero0, TrailBitsKnown1 - TrailZero1); unsigned ResultBitsKnown = std::min(SmallestOperand + TrailZ, BitWidth); APInt BottomKnown = Bottom0.getLoBits(TrailBitsKnown0) * Bottom1.getLoBits(TrailBitsKnown1); Known.resetAll(); Known.Zero.setHighBits(LeadZ); Known.Zero |= (~BottomKnown).getLoBits(ResultBitsKnown); Known.One |= BottomKnown.getLoBits(ResultBitsKnown); // Only make use of no-wrap flags if we failed to compute the sign bit // directly. 
This matters if the multiplication always overflows, in // which case we prefer to follow the result of the direct computation, // though as the program is invoking undefined behaviour we can choose // whatever we like here. if (isKnownNonNegative && !Known.isNegative()) Known.makeNonNegative(); else if (isKnownNegative && !Known.isNonNegative()) Known.makeNegative(); } void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, KnownBits &Known) { unsigned BitWidth = Known.getBitWidth(); unsigned NumRanges = Ranges.getNumOperands() / 2; assert(NumRanges >= 1); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); // The first CommonPrefixBits of all values in Range are equal. unsigned CommonPrefixBits = (Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros(); APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits); Known.One &= Range.getUnsignedMax() & Mask; Known.Zero &= ~Range.getUnsignedMax() & Mask; } } static bool isEphemeralValueOf(const Instruction *I, const Value *E) { SmallVector<const Value *, 16> WorkSet(1, I); SmallPtrSet<const Value *, 32> Visited; SmallPtrSet<const Value *, 16> EphValues; // The instruction defining an assumption's condition itself is always // considered ephemeral to that assumption (even if it has other // non-ephemeral users). See r246696's test case for an example. if (is_contained(I->operands(), E)) return true; while (!WorkSet.empty()) { const Value *V = WorkSet.pop_back_val(); if (!Visited.insert(V).second) continue; // If all uses of this value are ephemeral, then so is this value. if (llvm::all_of(V->users(), [&](const User *U) { return EphValues.count(U); })) { if (V == E) return true; if (V == I || isSafeToSpeculativelyExecute(V)) { EphValues.insert(V); if (const User *U = dyn_cast<User>(V)) for (User::const_op_iterator J = U->op_begin(), JE = U->op_end(); J != JE; ++J) WorkSet.push_back(*J); } } } return false; } // Is this an intrinsic that cannot be speculated but also cannot trap? bool llvm::isAssumeLikeIntrinsic(const Instruction *I) { if (const CallInst *CI = dyn_cast<CallInst>(I)) if (Function *F = CI->getCalledFunction()) switch (F->getIntrinsicID()) { default: break; // FIXME: This list is repeated from NoTTI::getIntrinsicCost. case Intrinsic::assume: case Intrinsic::sideeffect: case Intrinsic::dbg_declare: case Intrinsic::dbg_value: case Intrinsic::invariant_start: case Intrinsic::invariant_end: case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::objectsize: case Intrinsic::ptr_annotation: case Intrinsic::var_annotation: return true; } return false; } bool llvm::isValidAssumeForContext(const Instruction *Inv, const Instruction *CxtI, const DominatorTree *DT) { // There are two restrictions on the use of an assume: // 1. The assume must dominate the context (or the control flow must // reach the assume whenever it reaches the context). // 2. The context must not be in the assume's set of ephemeral values // (otherwise we will use the assume to prove that the condition // feeding the assume is trivially true, thus causing the removal of // the assume). if (DT) { if (DT->dominates(Inv, CxtI)) return true; } else if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) { // We don't have a DT, but this trivially dominates.
return true; } // With or without a DT, the only remaining case we will check is if the // instructions are in the same BB. Give up if that is not the case. if (Inv->getParent() != CxtI->getParent()) return false; // If we have a dom tree, then we now know that the assume doesn't dominate // the other instruction. If we don't have a dom tree then we can check if // the assume is first in the BB. if (!DT) { // Search forward from the assume until we reach the context (or the end // of the block); the common case is that the assume will come first. for (auto I = std::next(BasicBlock::const_iterator(Inv)), IE = Inv->getParent()->end(); I != IE; ++I) if (&*I == CxtI) return true; } // The context comes first, but they're both in the same block. Make sure // there is nothing in between that might interrupt the control flow. for (BasicBlock::const_iterator I = std::next(BasicBlock::const_iterator(CxtI)), IE(Inv); I != IE; ++I) if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I)) return false; return !isEphemeralValueOf(Inv, CxtI); } static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q) { // Use of assumptions is context-sensitive. If we don't have a context, we // cannot use them! if (!Q.AC || !Q.CxtI) return; unsigned BitWidth = Known.getBitWidth(); // Note that the patterns below need to be kept in sync with the code // in AssumptionCache::updateAffectedValues. for (auto &AssumeVH : Q.AC->assumptionsFor(V)) { if (!AssumeVH) continue; CallInst *I = cast<CallInst>(AssumeVH); assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() && "Got assumption for the wrong function!"); if (Q.isExcluded(I)) continue; // Warning: This loop can end up being somewhat performance sensitive. // We're running this loop once for each value queried, resulting in a // runtime of ~O(#assumes * #values). assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume && "must be an assume intrinsic"); Value *Arg = I->getArgOperand(0); if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); Known.setAllOnes(); return; } if (match(Arg, m_Not(m_Specific(V))) && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { assert(BitWidth == 1 && "assume operand is not i1?"); Known.setAllZero(); return; } // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth == MaxDepth) continue; Value *A, *B; auto m_V = m_CombineOr(m_Specific(V), m_CombineOr(m_PtrToInt(m_Specific(V)), m_BitCast(m_Specific(V)))); CmpInst::Predicate Pred; uint64_t C; // assume(v = a) if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); Known.Zero |= RHSKnown.Zero; Known.One |= RHSKnown.One; // assume(v & b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits MaskKnown(BitWidth); computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // known bits from the RHS to V.
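// For example, with i8 values, if the mask b is known to be 0x0f and the // assumption gives (v & b) == 0x05, then the low nibble of v becomes known: // bits 0 and 2 are known one and bits 1 and 3 are known zero, while the high // nibble of v stays unknown because the mask is not known one there.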
Known.Zero |= RHSKnown.Zero & MaskKnown.One; Known.One |= RHSKnown.One & MaskKnown.One; // assume(~(v & b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits MaskKnown(BitWidth); computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I)); // For those bits in the mask that are known to be one, we can propagate // inverted known bits from the RHS to V. Known.Zero |= RHSKnown.One & MaskKnown.One; Known.One |= RHSKnown.Zero & MaskKnown.One; // assume(v | b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. Known.Zero |= RHSKnown.Zero & BKnown.Zero; Known.One |= RHSKnown.One & BKnown.Zero; // assume(~(v | b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. Known.Zero |= RHSKnown.One & BKnown.Zero; Known.One |= RHSKnown.Zero & BKnown.Zero; // assume(v ^ b = a) } else if (match(Arg, m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate known // bits from the RHS to V. For those bits in B that are known to be one, // we can propagate inverted known bits from the RHS to V. Known.Zero |= RHSKnown.Zero & BKnown.Zero; Known.One |= RHSKnown.One & BKnown.Zero; Known.Zero |= RHSKnown.One & BKnown.One; Known.One |= RHSKnown.Zero & BKnown.One; // assume(~(v ^ b) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); KnownBits BKnown(BitWidth); computeKnownBits(B, BKnown, Depth+1, Query(Q, I)); // For those bits in B that are known to be zero, we can propagate // inverted known bits from the RHS to V. For those bits in B that are // known to be one, we can propagate known bits from the RHS to V. Known.Zero |= RHSKnown.One & BKnown.Zero; Known.One |= RHSKnown.Zero & BKnown.Zero; Known.Zero |= RHSKnown.Zero & BKnown.One; Known.One |= RHSKnown.One & BKnown.One; // assume(v << c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the right by C. 
RHSKnown.Zero.lshrInPlace(C); Known.Zero |= RHSKnown.Zero; RHSKnown.One.lshrInPlace(C); Known.One |= RHSKnown.One; // assume(~(v << c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the right by C. RHSKnown.One.lshrInPlace(C); Known.Zero |= RHSKnown.One; RHSKnown.Zero.lshrInPlace(C); Known.One |= RHSKnown.Zero; // assume(v >> c = a) } else if (match(Arg, m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them to known // bits in V shifted to the left by C. Known.Zero |= RHSKnown.Zero << C; Known.One |= RHSKnown.One << C; // assume(~(v >> c) = a) } else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))), m_Value(A))) && Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // For those bits in RHS that are known, we can propagate them inverted // to known bits in V shifted to the left by C. Known.Zero |= RHSKnown.One << C; Known.One |= RHSKnown.Zero << C; // assume(v >=_s c) where c is non-negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isNonNegative()) { // We know that the sign bit is zero. Known.makeNonNegative(); } // assume(v >_s c) where c is at least -1. } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SGT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isAllOnes() || RHSKnown.isNonNegative()) { // We know that the sign bit is zero. Known.makeNonNegative(); } // assume(v <=_s c) where c is negative } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isNegative()) { // We know that the sign bit is one. Known.makeNegative(); } // assume(v <_s c) where c is non-positive } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_SLT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); if (RHSKnown.isZero() || RHSKnown.isNegative()) { // We know that the sign bit is one. Known.makeNegative(); } // assume(v <=_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULE && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero.
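// For example, if c is an i8 known to be at most 0x0f (four leading bits // known zero), then assume(v <=_u c) forces the four high bits of v to be // known zero as well, since v cannot exceed 15.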
Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { KnownBits RHSKnown(BitWidth); computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1); else Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); } } // If assumptions conflict with each other or previous known bits, then we // have a logical fallacy. It's possible that the assumption is not reachable, // so this isn't a real bug. On the other hand, the program may have undefined // behavior, or we might have a bug in the compiler. We can't assert/crash, so // clear out the known bits, try to warn the user, and hope for the best. if (Known.Zero.intersects(Known.One)) { Known.resetAll(); if (Q.ORE) Q.ORE->emit([&]() { auto *CxtI = const_cast<Instruction *>(Q.CxtI); return OptimizationRemarkAnalysis("value-tracking", "BadAssumption", CxtI) << "Detected conflicting code assumptions. Program may " "have undefined behavior, or compiler may have " "internal error."; }); } } /// Compute known bits from a shift operator, including those with a /// non-constant shift amount. Known is the output of this function. Known2 is a /// pre-allocated temporary with the same bit width as Known. KZF and KOF are /// operator-specific functors that, given the known-zero or known-one bits /// respectively, and a shift amount, compute the implied known-zero or /// known-one bits of the shift operator's result respectively for that shift /// amount. The results from calling KZF and KOF are conservatively combined for /// all permitted shift amounts. static void computeKnownBitsFromShiftOperator( const Operator *I, KnownBits &Known, KnownBits &Known2, unsigned Depth, const Query &Q, function_ref<APInt(const APInt &, unsigned)> KZF, function_ref<APInt(const APInt &, unsigned)> KOF) { unsigned BitWidth = Known.getBitWidth(); if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) { unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known.Zero = KZF(Known.Zero, ShiftAmt); Known.One = KOF(Known.One, ShiftAmt); // If the known bits conflict, this must be an overflowing left shift, so // the shift result is poison. We can return anything we want. Choose 0 for // the best folding opportunity. if (Known.hasConflict()) Known.setAllZero(); return; } computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); // If the shift amount could be greater than or equal to the bit-width of the // LHS, the value could be poison, but bail out because the check below is // expensive. TODO: Should we just carry on? if ((~Known.Zero).uge(BitWidth)) { Known.resetAll(); return; } // Note: We cannot use Known.Zero.getLimitedValue() here, because if // BitWidth > 64 and any upper bits are known, we'll end up returning the // limit value (which implies all bits are known). uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue(); uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue(); // It would be more-clearly correct to use the two temporaries for this // calculation. Reusing the APInts here to prevent unnecessary allocations. Known.resetAll(); // If we know the shifter operand is nonzero, we can sometimes infer more // known bits. However this is expensive to compute, so be lazy about it and // only compute it when absolutely necessary.
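// For example, for an i32 shift whose amount has its low two bits known zero // (ShiftAmtKZ has bits 0 and 1 set), only the multiples of four in [0, 31] // survive the filtering in the loop below; and once the shift amount is also // known non-zero, the ShiftAmt == 0 iteration can be skipped as well.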
Optional<bool> ShifterOperandIsNonZero; // Early exit if we can't constrain any well-defined shift amount. if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) && !(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) { ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (!*ShifterOperandIsNonZero) return; } computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) { // Combine the shifted known input bits only for those shift amounts // compatible with its known constraints. if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt) continue; if ((ShiftAmt | ShiftAmtKO) != ShiftAmt) continue; // If we know the shifter is nonzero, we may be able to infer more known // bits. This check is sunk down as far as possible to avoid the expensive // call to isKnownNonZero if the cheaper checks above fail. if (ShiftAmt == 0) { if (!ShifterOperandIsNonZero.hasValue()) ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); if (*ShifterOperandIsNonZero) continue; } Known.Zero &= KZF(Known2.Zero, ShiftAmt); Known.One &= KOF(Known2.One, ShiftAmt); } // If the known bits conflict, the result is poison. Return a 0 and hope the // caller can further optimize that. if (Known.hasConflict()) Known.setAllZero(); } static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, unsigned Depth, const Query &Q) { unsigned BitWidth = Known.getBitWidth(); KnownBits Known2(Known); switch (I->getOpcode()) { default: break; case Instruction::Load: if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, Known); break; case Instruction::And: { // If either the LHS or the RHS are Zero, the result is zero. computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-1 bits are only known if set in both the LHS & RHS. Known.One &= Known2.One; // Output known-0 are known to be clear if zero in either the LHS | RHS. Known.Zero |= Known2.Zero; // and(x, add (x, -1)) is a common idiom that always clears the low bit; // here we handle the more general case of adding any odd number by // matching the form add(x, add(x, y)) where y is odd. // TODO: This could be generalized to clearing any bit set in y where the // following bit is known to be unset in y. Value *Y = nullptr; if (!Known.Zero[0] && !Known.One[0] && (match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)), m_Value(Y))) || match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)), m_Value(Y))))) { Known2.resetAll(); computeKnownBits(Y, Known2, Depth + 1, Q); if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } break; } case Instruction::Or: computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are only known if clear in both the LHS & RHS. Known.Zero &= Known2.Zero; // Output known-1 are known to be set if set in either the LHS | RHS. Known.One |= Known2.One; break; case Instruction::Xor: { computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One); // Output known-1 are known to be set if set in only one of the LHS, RHS.
Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero); Known.Zero = std::move(KnownZeroOut); break; } case Instruction::Mul: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::UDiv: { // For the purposes of computing leading zeros we can conservatively // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); unsigned LeadZ = Known2.countMinLeadingZeros(); Known2.resetAll(); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); if (RHSMaxLeadingZeros != BitWidth) LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; } case Instruction::Select: { const Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor; if (SelectPatternResult::isMinOrMax(SPF)) { computeKnownBits(RHS, Known, Depth + 1, Q); computeKnownBits(LHS, Known2, Depth + 1, Q); } else { computeKnownBits(I->getOperand(2), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); } unsigned MaxHighOnes = 0; unsigned MaxHighZeros = 0; if (SPF == SPF_SMAX) { // If both sides are negative, the result is negative. if (Known.isNegative() && Known2.isNegative()) // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); // If either side is non-negative, the result is non-negative. else if (Known.isNonNegative() || Known2.isNonNegative()) MaxHighZeros = 1; } else if (SPF == SPF_SMIN) { // If both sides are non-negative, the result is non-negative. if (Known.isNonNegative() && Known2.isNonNegative()) // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); // If either side is negative, the result is negative. else if (Known.isNegative() || Known2.isNegative()) MaxHighOnes = 1; } else if (SPF == SPF_UMAX) { // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); } else if (SPF == SPF_UMIN) { // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); } // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; if (MaxHighOnes > 0) Known.One.setHighBits(MaxHighOnes); if (MaxHighZeros > 0) Known.Zero.setHighBits(MaxHighZeros); break; } case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::SIToFP: case Instruction::UIToFP: break; // Can't work with floating point. case Instruction::PtrToInt: case Instruction::IntToPtr: // Fall through and handle them the same as zext/trunc. LLVM_FALLTHROUGH; case Instruction::ZExt: case Instruction::Trunc: { Type *SrcTy = I->getOperand(0)->getType(); unsigned SrcBitWidth; // Note that we handle pointer operands here because of inttoptr/ptrtoint // which fall through here.
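// For example, for "%z = zext i8 %x to i32" the known bits of %x are // computed at width 8, widened back to 32, and bits 8..31 of %z become // known zero.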
SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType()); assert(SrcBitWidth && "SrcBitWidth can't be zero"); Known = Known.zextOrTrunc(SrcBitWidth); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known = Known.zextOrTrunc(BitWidth); // Any top bits are known to be zero. if (BitWidth > SrcBitWidth) Known.Zero.setBitsFrom(SrcBitWidth); break; } case Instruction::BitCast: { Type *SrcTy = I->getOperand(0)->getType(); if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) && // TODO: For now, not handling conversions like: // (bitcast i64 %x to <2 x i32>) !I->getType()->isVectorTy()) { computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; } break; } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); Known = Known.trunc(SrcBitWidth); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. Known = Known.sext(BitWidth); break; } case Instruction::Shl: { // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0 bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) { APInt KZResult = KnownZero << ShiftAmt; KZResult.setLowBits(ShiftAmt); // Low bits known 0. // If this shift has "nsw" keyword, then the result is either a poison // value or has the same sign bit as the first operand. if (NSW && KnownZero.isSignBitSet()) KZResult.setSignBit(); return KZResult; }; auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) { APInt KOResult = KnownOne << ShiftAmt; if (NSW && KnownOne.isSignBitSet()) KOResult.setSignBit(); return KOResult; }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::LShr: { // (lshr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) { APInt KZResult = KnownZero.lshr(ShiftAmt); // High bits known zero. KZResult.setHighBits(ShiftAmt); return KZResult; }; auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) { return KnownOne.lshr(ShiftAmt); }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::AShr: { // (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0 auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) { return KnownZero.ashr(ShiftAmt); }; auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) { return KnownOne.ashr(ShiftAmt); }; computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); break; } case Instruction::Sub: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::Add: { bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, Known, Known2, Depth, Q); break; } case Instruction::SRem: if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) { APInt RA = Rem->getValue().abs(); if (RA.isPowerOf2()) { APInt LowBits = RA - 1; computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // The low bits of the first operand are unchanged by the srem. Known.Zero = Known2.Zero & LowBits; Known.One = Known2.One & LowBits; // If the first operand is non-negative or has all low bits zero, then // the upper bits are all zero.
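// For example, "%r = srem i32 %x, 8" copies the low three bits of %x into // %r; if %x is also known non-negative, bits 3..31 of %r are known zero.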
if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero)) Known.Zero |= ~LowBits; // If the first operand is negative and not all low bits are zero, then // the upper bits are all one. if (Known2.isNegative() && LowBits.intersects(Known2.One)) Known.One |= ~LowBits; assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); break; } } // The sign bit is the LHS's sign bit, except when the result of the // remainder is zero. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If it's known zero, our sign bit is also zero. if (Known2.isNonNegative()) Known.makeNonNegative(); break; case Instruction::URem: { if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) { const APInt &RA = Rem->getValue(); if (RA.isPowerOf2()) { APInt LowBits = (RA - 1); computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); Known.Zero |= ~LowBits; Known.One &= LowBits; break; } } // Since the result is less than or equal to either operand, any leading // zero bits in either operand must also exist in the result. computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); unsigned Leaders = std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; } case Instruction::Alloca: { const AllocaInst *AI = cast<AllocaInst>(I); unsigned Align = AI->getAlignment(); if (Align == 0) Align = Q.DL.getABITypeAlignment(AI->getAllocatedType()); if (Align > 0) Known.Zero.setLowBits(countTrailingZeros(Align)); break; } case Instruction::GetElementPtr: { // Analyze all of the subscripts of this getelementptr instruction // to determine if we can prove known low zero bits. KnownBits LocalKnown(BitWidth); computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q); unsigned TrailZ = LocalKnown.countMinTrailingZeros(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { Value *Index = I->getOperand(i); if (StructType *STy = GTI.getStructTypeOrNull()) { // Handle struct member offset arithmetic. // Handle case when index is vector zeroinitializer Constant *CIndex = cast<Constant>(Index); if (CIndex->isZeroValue()) continue; if (CIndex->getType()->isVectorTy()) Index = CIndex->getSplatValue(); unsigned Idx = cast<ConstantInt>(Index)->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t Offset = SL->getElementOffset(Idx); TrailZ = std::min(TrailZ, countTrailingZeros(Offset)); } else { // Handle array index arithmetic. Type *IndexedTy = GTI.getIndexedType(); if (!IndexedTy->isSized()) { TrailZ = 0; break; } unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits(); uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy); LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0); computeKnownBits(Index, LocalKnown, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + LocalKnown.countMinTrailingZeros())); } } Known.Zero.setLowBits(TrailZ); break; } case Instruction::PHI: { const PHINode *P = cast<PHINode>(I); // Handle the case of a simple two-predecessor recurrence PHI. // There's a lot more that could theoretically be done here, but // this is sufficient to catch some interesting cases.
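// For example, for "%i = phi i32 [ 16, %entry ], [ %i.next, %loop ]" with // "%i.next = add i32 %i, 4", both the start value 16 and the step 4 have // their low two bits zero, so %i is known to have its low two bits zero as // well.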
if (P->getNumIncomingValues() == 2) { for (unsigned i = 0; i != 2; ++i) { Value *L = P->getIncomingValue(i); Value *R = P->getIncomingValue(!i); Operator *LU = dyn_cast<Operator>(L); if (!LU) continue; unsigned Opcode = LU->getOpcode(); // Check for operations that have the property that if // both their operands have low zero bits, the result // will have low zero bits. if (Opcode == Instruction::Add || Opcode == Instruction::Sub || Opcode == Instruction::And || Opcode == Instruction::Or || Opcode == Instruction::Mul) { Value *LL = LU->getOperand(0); Value *LR = LU->getOperand(1); // Find a recurrence. if (LL == I) L = LR; else if (LR == I) L = LL; else break; // Ok, we have a PHI of the form L op= R. Check for low // zero bits. computeKnownBits(R, Known2, Depth + 1, Q); // We need to take the minimum number of known bits KnownBits Known3(Known); computeKnownBits(L, Known3, Depth + 1, Q); Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(), Known3.countMinTrailingZeros())); auto *OverflowOp = dyn_cast<OverflowingBinaryOperator>(LU); if (OverflowOp && OverflowOp->hasNoSignedWrap()) { // If initial value of recurrence is nonnegative, and we are adding // a nonnegative number with nsw, the result can only be nonnegative // or poison value regardless of the number of times we execute the // add in phi recurrence. If initial value is negative and we are // adding a negative number with nsw, the result can only be // negative or poison value. Similar arguments apply to sub and mul. // // (add non-negative, non-negative) --> non-negative // (add negative, negative) --> negative if (Opcode == Instruction::Add) { if (Known2.isNonNegative() && Known3.isNonNegative()) Known.makeNonNegative(); else if (Known2.isNegative() && Known3.isNegative()) Known.makeNegative(); } // (sub nsw non-negative, negative) --> non-negative // (sub nsw negative, non-negative) --> negative else if (Opcode == Instruction::Sub && LL == I) { if (Known2.isNonNegative() && Known3.isNegative()) Known.makeNonNegative(); else if (Known2.isNegative() && Known3.isNonNegative()) Known.makeNegative(); } // (mul nsw non-negative, non-negative) --> non-negative else if (Opcode == Instruction::Mul && Known2.isNonNegative() && Known3.isNonNegative()) Known.makeNonNegative(); } break; } } } // Unreachable blocks may have zero-operand PHI nodes. if (P->getNumIncomingValues() == 0) break; // Otherwise take the unions of the known bit sets of the operands, // taking conservative care to avoid excessive recursion. if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) { // Skip if every incoming value references the PHI itself. if (dyn_cast_or_null<UndefValue>(P->hasConstantValue())) break; Known.Zero.setAllBits(); Known.One.setAllBits(); for (Value *IncValue : P->incoming_values()) { // Skip direct self references. if (IncValue == P) continue; Known2 = KnownBits(BitWidth); // Recurse, but cap the recursion to one level, because we don't // want to waste time spinning around in loops. computeKnownBits(IncValue, Known2, MaxDepth - 1, Q); Known.Zero &= Known2.Zero; Known.One &= Known2.One; // If all bits have been ruled out, there's no need to check // more operands. if (!Known.Zero && !Known.One) break; } } break; } case Instruction::Call: case Instruction::Invoke: // If range metadata is attached to this call, set known bits from that, // and then intersect with known bits based on other properties of the // function.
if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, Known); if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) { computeKnownBits(RV, Known2, Depth + 1, Q); Known.Zero |= Known2.Zero; Known.One |= Known2.One; } if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::bitreverse: computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.reverseBits(); Known.One |= Known2.One.reverseBits(); break; case Intrinsic::bswap: computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); Known.Zero |= Known2.Zero.byteSwap(); Known.One |= Known2.One.byteSwap(); break; case Intrinsic::ctlz: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleLZ = Known2.One.countLeadingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleLZ = std::min(PossibleLZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleLZ)+1; Known.Zero.setBitsFrom(LowBits); break; } case Intrinsic::cttz: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // If we have a known 1, its position is our upper bound. unsigned PossibleTZ = Known2.One.countTrailingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) PossibleTZ = std::min(PossibleTZ, BitWidth - 1); unsigned LowBits = Log2_32(PossibleTZ)+1; Known.Zero.setBitsFrom(LowBits); break; } case Intrinsic::ctpop: { computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. unsigned BitsPossiblySet = Known2.countMaxPopulation(); unsigned LowBits = Log2_32(BitsPossiblySet)+1; Known.Zero.setBitsFrom(LowBits); // TODO: we could bound KnownOne using the lower bound on the number // of bits which might be set provided by popcnt KnownOne2. break; } case Intrinsic::x86_sse42_crc32_64_64: Known.Zero.setBitsFrom(32); break; } } break; case Instruction::ExtractElement: // Look through extract element. At the moment we keep this simple and skip // tracking the specific element. But at least we might find information // valid for all elements of the vector (for example if vector is sign // extended, shifted, etc). computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); break; case Instruction::ExtractValue: if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) { const ExtractValueInst *EVI = cast<ExtractValueInst>(I); if (EVI->getNumIndices() != 1) break; if (EVI->getIndices()[0] == 0) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, Known, Known2, Depth, Q); break; } } } } } /// Determine which bits of V are known to be either zero or one and return /// them.
KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) { KnownBits Known(getBitWidth(V->getType(), Q.DL)); computeKnownBits(V, Known, Depth, Q); return Known; } /// Determine which bits of V are known to be either zero or one and return /// them in the Known bit set. /// /// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that /// we cannot optimize based on the assumption that it is zero without changing /// it to be an explicit zero. If we don't change it to zero, other code could /// be optimized based on the contradictory assumption that it is non-zero. /// Because instcombine aggressively folds operations with undef args anyway, /// this won't lose us code quality. /// /// This function is defined on values with integer type, values with pointer /// type, and vectors of integers. In the case where V is a vector, the known /// zero and known one values are the same width as the vector element, and a /// bit is set only if it is true for all of the elements in the vector. void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, const Query &Q) { assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); unsigned BitWidth = Known.getBitWidth(); assert((V->getType()->isIntOrIntVectorTy(BitWidth) || V->getType()->isPtrOrPtrVectorTy()) && "Not integer or pointer type!"); assert(Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth && "V and Known should have same BitWidth"); (void)BitWidth; const APInt *C; if (match(V, m_APInt(C))) { // We know all of the bits for a scalar constant or a splat vector constant! Known.One = *C; Known.Zero = ~Known.One; return; } // Null and aggregate-zero are all-zeros. if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) { Known.setAllZero(); return; } // Handle a constant vector by taking the intersection of the known bits of // each element. if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) { // We know that CDS must be a vector of integers. Take the intersection of // each element. Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) { APInt Elt = CDS->getElementAsAPInt(i); Known.Zero &= ~Elt; Known.One &= Elt; } return; } if (const auto *CV = dyn_cast<ConstantVector>(V)) { // We know that CV must be a vector of integers. Take the intersection of // each element. Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) { Constant *Element = CV->getAggregateElement(i); auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element); if (!ElementCI) { Known.resetAll(); return; } const APInt &Elt = ElementCI->getValue(); Known.Zero &= ~Elt; Known.One &= Elt; } return; } // Start out not knowing anything. Known.resetAll(); // We can't imply anything about undefs. if (isa<UndefValue>(V)) return; // There's no point in looking through other users of ConstantData for // assumptions. Confirm that we've handled them all. assert(!isa<ConstantData>(V) && "Unhandled constant data!"); // Limit search depth. // All recursive calls that increase depth must come after this. if (Depth == MaxDepth) return; // A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has // the bits of its aliasee.
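// For example, "@a = alias i32, i32* @g" inherits whatever is known about @g // (such as known-zero low bits implied by @g's alignment), while an // interposable alias could be resolved to a different definition at link // time, so nothing can be inferred for it.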
if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { if (!GA->isInterposable()) computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q); return; } if (const Operator *I = dyn_cast<Operator>(V)) computeKnownBitsFromOperator(I, Known, Depth, Q); // Aligned pointers have trailing zeros - refine Known.Zero set if (V->getType()->isPointerTy()) { unsigned Align = V->getPointerAlignment(Q.DL); if (Align) Known.Zero.setLowBits(countTrailingZeros(Align)); } // computeKnownBitsFromAssume strictly refines Known. // Therefore, we run them after computeKnownBitsFromOperator. // Check whether a nearby assume intrinsic can determine some known bits. computeKnownBitsFromAssume(V, Known, Depth, Q); assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer /// types and vectors of integers. bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q) { assert(Depth <= MaxDepth && "Limit Search Depth"); if (const Constant *C = dyn_cast<Constant>(V)) { if (C->isNullValue()) return OrZero; const APInt *ConstIntOrConstSplatInt; if (match(C, m_APInt(ConstIntOrConstSplatInt))) return ConstIntOrConstSplatInt->isPowerOf2(); } // 1 << X is clearly a power of two if the one is not shifted off the end. If // it is shifted off the end then the result is undefined. if (match(V, m_Shl(m_One(), m_Value()))) return true; // (signmask) >>l X is clearly a power of two if the one is not shifted off // the bottom. If it is shifted off the bottom then the result is undefined. if (match(V, m_LShr(m_SignMask(), m_Value()))) return true; // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ == MaxDepth) return false; Value *X = nullptr, *Y = nullptr; // A shift left or a logical shift right of a power of two is a power of two // or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || match(V, m_LShr(m_Value(X), m_Value())))) return isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q); if (const ZExtInst *ZI = dyn_cast<ZExtInst>(V)) return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q); if (const SelectInst *SI = dyn_cast<SelectInst>(V)) return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) && isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q); if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) { // A power of two and'd with anything is a power of two or zero. if (isKnownToBeAPowerOfTwo(X, /*OrZero*/ true, Depth, Q) || isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, Depth, Q)) return true; // X & (-X) is always a power of two or zero. if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X)))) return true; return false; } // Adding a power-of-two or zero to the same power-of-two or zero yields // either the original power-of-two, a larger power-of-two or zero.
if (match(V, m_Add(m_Value(X), m_Value(Y)))) { const OverflowingBinaryOperator *VOBO = cast<OverflowingBinaryOperator>(V); if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) { if (match(X, m_And(m_Specific(Y), m_Value())) || match(X, m_And(m_Value(), m_Specific(Y)))) if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q)) return true; if (match(Y, m_And(m_Specific(X), m_Value())) || match(Y, m_And(m_Value(), m_Specific(X)))) if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q)) return true; unsigned BitWidth = V->getType()->getScalarSizeInBits(); KnownBits LHSBits(BitWidth); computeKnownBits(X, LHSBits, Depth, Q); KnownBits RHSBits(BitWidth); computeKnownBits(Y, RHSBits, Depth, Q); // If i8 V is a power of two or zero: // ZeroBits: 1 1 1 0 1 1 1 1 // ~ZeroBits: 0 0 0 1 0 0 0 0 if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2()) // If OrZero isn't set, we cannot give back a zero result. // Make sure either the LHS or RHS has a bit set. if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue()) return true; } } // An exact divide or right shift can only shift off zero bits, so the result // is a power of two only if the first operand is a power of two and not // copying a sign bit (sdiv int_min, 2). if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) || match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) { return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero, Depth, Q); } return false; } /// \brief Test whether a GEP's result is known to be non-null. /// /// Uses properties inherent in a GEP to try to determine whether it is known /// to be non-null. /// /// Currently this routine does not support vector GEPs. static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, const Query &Q) { if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0) return false; // FIXME: Support vector-GEPs. assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP"); // If the base pointer is non-null, we cannot walk to a null address with an // inbounds GEP in address space zero. if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q)) return true; // Walk the GEP operands and see if any operand introduces a non-zero offset. // If so, then the GEP cannot produce a null pointer, as doing so would // inherently violate the inbounds contract within address space zero. for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); GTI != GTE; ++GTI) { // Struct types are easy -- they must always be indexed by a constant. if (StructType *STy = GTI.getStructTypeOrNull()) { ConstantInt *OpC = cast<ConstantInt>(GTI.getOperand()); unsigned ElementIdx = OpC->getZExtValue(); const StructLayout *SL = Q.DL.getStructLayout(STy); uint64_t ElementOffset = SL->getElementOffset(ElementIdx); if (ElementOffset > 0) return true; continue; } // If we have a zero-sized type, the index doesn't matter. Keep looping. if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0) continue; // Fast path the constant operand case both for efficiency and so we don't // increment Depth when just zipping down an all-constant GEP. if (ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand())) { if (!OpC->isZero()) return true; continue; } // We post-increment Depth here because while isKnownNonZero increments it // as well, when we pop back up that increment won't persist. We don't want // to recurse 10k times just because we have 10k GEP operands. We don't // bail completely out because we want to handle constant GEPs regardless // of depth.
if (Depth++ >= MaxDepth) continue; if (isKnownNonZero(GTI.getOperand(), Depth, Q)) return true; } return false; } static bool isKnownNonNullFromDominatingCondition(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "V must be pointer type"); assert(!isa<ConstantPointerNull>(V) && "Did not expect ConstantPointerNull"); if (!CtxI || !DT) return false; unsigned NumUsesExplored = 0; for (auto *U : V->users()) { // Avoid massive lists if (NumUsesExplored >= DomConditionsMaxUses) break; NumUsesExplored++; // If the value is used as an argument to a call or invoke, then argument // attributes may provide an answer about null-ness. if (auto CS = ImmutableCallSite(U)) if (auto *CalledFunc = CS.getCalledFunction()) for (const Argument &Arg : CalledFunc->args()) if (CS.getArgOperand(Arg.getArgNo()) == V && Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI)) return true; // Consider only compare instructions uniquely controlling a branch CmpInst::Predicate Pred; if (!match(const_cast<User *>(U), m_c_ICmp(Pred, m_Specific(V), m_Zero())) || (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)) continue; for (auto *CmpU : U->users()) { if (const BranchInst *BI = dyn_cast<BranchInst>(CmpU)) { assert(BI->isConditional() && "uses a comparison!"); BasicBlock *NonNullSuccessor = BI->getSuccessor(Pred == ICmpInst::ICMP_EQ ? 1 : 0); BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor); if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent())) return true; } else if (Pred == ICmpInst::ICMP_NE && match(CmpU, m_Intrinsic<Intrinsic::experimental_guard>()) && DT->dominates(cast<Instruction>(CmpU), CtxI)) { return true; } } } return false; } /// Does the 'Range' metadata (which must be a valid MD_range operand list) /// ensure that the value it's attached to is never Value? 'RangeType' is /// the type of the value described by the range. static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value) { const unsigned NumRanges = Ranges->getNumOperands() / 2; assert(NumRanges >= 1); for (unsigned i = 0; i < NumRanges; ++i) { ConstantInt *Lower = mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 0)); ConstantInt *Upper = mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); if (Range.contains(Value)) return false; } return true; } /// Return true if the given value is known to be non-zero when defined. For /// vectors, return true if every element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (auto *C = dyn_cast<Constant>(V)) { if (C->isNullValue()) return false; if (isa<ConstantInt>(C)) // Must be non-zero due to null test above. return true; // For constant vectors, check that all elements are undefined or known // non-zero to determine that the whole vector is known non-zero. if (auto *VecTy = dyn_cast<VectorType>(C->getType())) { for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) { Constant *Elt = C->getAggregateElement(i); if (!Elt || Elt->isNullValue()) return false; if (!isa<UndefValue>(Elt) && !isa<ConstantInt>(Elt)) return false; } return true; } // A global variable in address space 0 is non null unless extern weak // or an absolute symbol reference. Other address spaces may have null as a // valid address for a global, so we can't assume anything.
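// For example, "@g = global i32 0" is known non-null in address space 0, // while "@w = extern_weak global i32" may legitimately resolve to null if // the symbol is absent at link time.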
if (const GlobalValue *GV = dyn_cast(V)) { if (!GV->isAbsoluteSymbolRef() && !GV->hasExternalWeakLinkage() && GV->getType()->getAddressSpace() == 0) return true; } else return false; } if (auto *I = dyn_cast(V)) { if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) { // If the possible ranges don't contain zero, then the value is // definitely non-zero. if (auto *Ty = dyn_cast(V->getType())) { const APInt ZeroValue(Ty->getBitWidth(), 0); if (rangeMetadataExcludesValue(Ranges, ZeroValue)) return true; } } } // Check for pointer simplifications. if (V->getType()->isPointerTy()) { // Alloca never returns null, malloc might. if (isa(V) && Q.DL.getAllocaAddrSpace() == 0) return true; // A byval, inalloca, or nonnull argument is never null. if (const Argument *A = dyn_cast(V)) if (A->hasByValOrInAllocaAttr() || A->hasNonNullAttr()) return true; // A Load tagged with nonnull metadata is never null. if (const LoadInst *LI = dyn_cast(V)) if (LI->getMetadata(LLVMContext::MD_nonnull)) return true; if (auto CS = ImmutableCallSite(V)) if (CS.isReturnNonNull()) return true; } // The remaining tests are all recursive, so bail out if we hit the limit. if (Depth++ >= MaxDepth) return false; // Check for recursive pointer simplifications. if (V->getType()->isPointerTy()) { if (isKnownNonNullFromDominatingCondition(V, Q.CxtI, Q.DT)) return true; if (const GEPOperator *GEP = dyn_cast(V)) if (isGEPKnownNonNull(GEP, Depth, Q)) return true; } unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL); // X | Y != 0 if X != 0 or Y != 0. Value *X = nullptr, *Y = nullptr; if (match(V, m_Or(m_Value(X), m_Value(Y)))) return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q); // ext X != 0 if X != 0. if (isa(V) || isa(V)) return isKnownNonZero(cast(V)->getOperand(0), Depth, Q); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. if (match(V, m_Shl(m_Value(X), m_Value(Y)))) { // shl nuw can't remove any non-zero bits. const OverflowingBinaryOperator *BO = cast(V); if (BO->hasNoUnsignedWrap()) return isKnownNonZero(X, Depth, Q); KnownBits Known(BitWidth); computeKnownBits(X, Known, Depth, Q); if (Known.One[0]) return true; } // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) { // shr exact can only shift out zero bits. const PossiblyExactOperator *BO = cast(V); if (BO->isExact()) return isKnownNonZero(X, Depth, Q); KnownBits Known = computeKnownBits(X, Depth, Q); if (Known.isNegative()) return true; // If the shifter operand is a constant, and all of the bits shifted // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? if (Known.countMinTrailingZeros() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } // div exact can only produce a zero if the dividend is zero. else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { return isKnownNonZero(X, Depth, Q); } // X + Y. 
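  // Sketch of the reasoning used below: two non-negative i8 values each lie
  // in [0, 127], so their sum lies in [0, 254] and cannot wrap around to 0;
  // the sum is therefore zero only if both operands are zero.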
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { KnownBits XKnown = computeKnownBits(X, Depth, Q); KnownBits YKnown = computeKnownBits(Y, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnown.isNonNegative() && YKnown.isNonNegative()) if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. if (XKnown.isNegative() && YKnown.isNegative()) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. if (XKnown.One.intersects(Mask)) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. if (YKnown.One.intersects(Mask)) return true; } // The sum of a non-negative number and a power of two is not zero. if (XKnown.isNonNegative() && isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q)) return true; if (YKnown.isNonNegative() && isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q)) return true; } // X * Y. else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) { const OverflowingBinaryOperator *BO = cast(V); // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if ((BO->hasNoSignedWrap() || BO->hasNoUnsignedWrap()) && isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q)) return true; } // (C ? X : Y) != 0 if X != 0 and Y != 0. else if (const SelectInst *SI = dyn_cast(V)) { if (isKnownNonZero(SI->getTrueValue(), Depth, Q) && isKnownNonZero(SI->getFalseValue(), Depth, Q)) return true; } // PHI else if (const PHINode *PN = dyn_cast(V)) { // Try and detect a recurrence that monotonically increases from a // starting value, as these are common as induction variables. if (PN->getNumIncomingValues() == 2) { Value *Start = PN->getIncomingValue(0); Value *Induction = PN->getIncomingValue(1); if (isa(Induction) && !isa(Start)) std::swap(Start, Induction); if (ConstantInt *C = dyn_cast(Start)) { if (!C->isZero() && !C->isNegative()) { ConstantInt *X; if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) || match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) && !X->isNegative()) return true; } } } // Check if all incoming values are non-zero constant. bool AllNonZeroConstants = llvm::all_of(PN->operands(), [](Value *V) { return isa(V) && !cast(V)->isZero(); }); if (AllNonZeroConstants) return true; } KnownBits Known(BitWidth); computeKnownBits(V, Known, Depth, Q); return Known.One != 0; } /// Return true if V2 == V1 + X, where X is known non-zero. static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) { const BinaryOperator *BO = dyn_cast(V1); if (!BO || BO->getOpcode() != Instruction::Add) return false; Value *Op = nullptr; if (V2 == BO->getOperand(0)) Op = BO->getOperand(1); else if (V2 == BO->getOperand(1)) Op = BO->getOperand(0); else return false; return isKnownNonZero(Op, 0, Q); } /// Return true if it is known that V1 != V2. static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q) { if (V1 == V2) return false; if (V1->getType() != V2->getType()) // We can't look through casts yet. return false; if (isAddOfNonZero(V1, V2, Q) || isAddOfNonZero(V2, V1, Q)) return true; if (V1->getType()->isIntOrIntVectorTy()) { // Are any known bits in V1 contradictory to known bits in V2? If V1 // has a known zero where V2 has a known one, they must not be equal. 
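    // For instance (a sketch): V1 = or i8 %a, 1 has bit 0 known one, while
    // V2 = shl i8 %b, 1 has bit 0 known zero, so V1 and V2 can never be
    // equal regardless of the remaining bits.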
    KnownBits Known1 = computeKnownBits(V1, 0, Q);
    KnownBits Known2 = computeKnownBits(V2, 0, Q);

    if (Known1.Zero.intersects(Known2.One) ||
        Known2.Zero.intersects(Known1.One))
      return true;
  }
  return false;
}

/// Return true if 'V & Mask' is known to be zero. We use this predicate to
/// simplify operations downstream. Mask is known to be zero for bits that V
/// cannot have.
///
/// This function is defined on values with integer type, values with pointer
/// type, and vectors of integers. In the case where V is a vector, the mask,
/// known zero, and known one values are the same width as the vector element,
/// and the bit is set only if it is true for all of the elements in the
/// vector.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
                       const Query &Q) {
  KnownBits Known(Mask.getBitWidth());
  computeKnownBits(V, Known, Depth, Q);
  return Mask.isSubsetOf(Known.Zero);
}

/// For vector constants, loop over the elements and find the constant with the
/// minimum number of sign bits. Return 0 if the value is not a vector constant
/// or if any element was not analyzed; otherwise, return the count for the
/// element with the minimum number of sign bits.
static unsigned computeNumSignBitsVectorConstant(const Value *V,
                                                 unsigned TyBits) {
  const auto *CV = dyn_cast<Constant>(V);
  if (!CV || !CV->getType()->isVectorTy())
    return 0;

  unsigned MinSignBits = TyBits;
  unsigned NumElts = CV->getType()->getVectorNumElements();
  for (unsigned i = 0; i != NumElts; ++i) {
    // If we find a non-ConstantInt, bail out.
    auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
    if (!Elt)
      return 0;

    MinSignBits = std::min(MinSignBits, Elt->getValue().getNumSignBits());
  }

  return MinSignBits;
}

static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
                                       const Query &Q);

static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
                                   const Query &Q) {
  unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
  assert(Result > 0 && "At least one sign bit needs to be present!");
  return Result;
}

/// Return the number of times the sign bit of the register is replicated into
/// the other bits. We know that at least 1 bit is always equal to the sign bit
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
/// vector element with the minimum number of known sign bits.
static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
                                       const Query &Q) {
  assert(Depth <= MaxDepth && "Limit Search Depth");

  // We return the minimum number of sign bits that are guaranteed to be
  // present in V, so for undef we have to conservatively return 1. We don't
  // have the same behavior for poison though -- that's a FIXME today.

  unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
  unsigned Tmp, Tmp2;
  unsigned FirstAnswer = 1;

  // Note that ConstantInt is handled by the general computeKnownBits case
  // below.

  if (Depth == MaxDepth)
    return 1;  // Limit search depth.

  const Operator *U = dyn_cast<Operator>(V);
  switch (Operator::getOpcode(V)) {
  default: break;
  case Instruction::SExt:
    Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
    return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp;

  case Instruction::SDiv: {
    const APInt *Denominator;
    // sdiv X, C -> adds log(C) sign bits.
    if (match(U->getOperand(1), m_APInt(Denominator))) {

      // Ignore non-positive denominator.
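      // (Worked example, as a sketch: sdiv i32 %x, 8 yields a quotient in
      // [-2^28, 2^28), guaranteeing logBase2(8) == 3 sign bits beyond the
      // numerator's minimum of one.)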
if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Add floor(log(C)) bits to the numerator bits. return std::min(TyBits, NumBits + Denominator->logBase2()); } break; } case Instruction::SRem: { const APInt *Denominator; // srem X, C -> we know that the result is within [-C+1,C) when C is a // positive constant. This let us put a lower bound on the number of sign // bits. if (match(U->getOperand(1), m_APInt(Denominator))) { // Ignore non-positive denominator. if (!Denominator->isStrictlyPositive()) break; // Calculate the incoming numerator bits. SRem by a positive constant // can't lower the number of sign bits. unsigned NumrBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // Calculate the leading sign bit constraints by examining the // denominator. Given that the denominator is positive, there are two // cases: // // 1. the numerator is positive. The result range is [0,C) and [0,C) u< // (1 << ceilLogBase2(C)). // // 2. the numerator is negative. Then the result range is (-C,0] and // integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)). // // Thus a lower bound on the number of sign bits is `TyBits - // ceilLogBase2(C)`. unsigned ResBits = TyBits - Denominator->ceilLogBase2(); return std::max(NumrBits, ResBits); } break; } case Instruction::AShr: { Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); // ashr X, C -> adds C sign bits. Vectors too. const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { if (ShAmt->uge(TyBits)) break; // Bad shift. unsigned ShAmtLimited = ShAmt->getZExtValue(); Tmp += ShAmtLimited; if (Tmp > TyBits) Tmp = TyBits; } return Tmp; } case Instruction::Shl: { const APInt *ShAmt; if (match(U->getOperand(1), m_APInt(ShAmt))) { // shl destroys sign bits. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (ShAmt->uge(TyBits) || // Bad shift. ShAmt->uge(Tmp)) break; // Shifted all sign bits out. Tmp2 = ShAmt->getZExtValue(); return Tmp - Tmp2; } break; } case Instruction::And: case Instruction::Or: case Instruction::Xor: // NOT is handled here. // Logical binary ops preserve the number of sign bits at the worst. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp != 1) { Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); FirstAnswer = std::min(Tmp, Tmp2); // We computed what we know about the sign bits as our first // answer. Now proceed to the generic code that uses // computeKnownBits, and pick whichever answer is better. } break; case Instruction::Select: Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q); return std::min(Tmp, Tmp2); case Instruction::Add: // Add can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. // Special case decrementing a value (ADD X, -1): if (const auto *CRHS = dyn_cast(U->getOperand(1))) if (CRHS->isAllOnesValue()) { KnownBits Known(TyBits); computeKnownBits(U->getOperand(0), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If we are subtracting one from a positive number, there is no carry // out of the result. 
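        // (Sketch: in i8, a known-non-negative input such as 0x05 decrements
        // to 0x04, and 0x00 decrements to 0xff == -1, which has all sign
        // bits set; either way the operand's sign-bit count is preserved.)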
if (Known.isNonNegative()) return Tmp; } Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) return 1; return std::min(Tmp, Tmp2)-1; case Instruction::Sub: Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (Tmp2 == 1) return 1; // Handle NEG. if (const auto *CLHS = dyn_cast(U->getOperand(0))) if (CLHS->isNullValue()) { KnownBits Known(TyBits); computeKnownBits(U->getOperand(1), Known, Depth + 1, Q); // If the input is known to be 0 or 1, the output is 0/-1, which is all // sign bits set. if ((Known.Zero | 1).isAllOnesValue()) return TyBits; // If the input is known to be positive (the sign bit is known clear), // the output of the NEG has the same number of sign bits as the input. if (Known.isNonNegative()) return Tmp2; // Otherwise, we treat this like a SUB. } // Sub can have at most one carry bit. Thus we know that the output // is, at worst, one more bit than the inputs. Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (Tmp == 1) return 1; // Early out. return std::min(Tmp, Tmp2)-1; case Instruction::Mul: { // The output of the Mul can be at most twice the valid bits in the inputs. unsigned SignBitsOp0 = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); if (SignBitsOp0 == 1) return 1; // Early out. unsigned SignBitsOp1 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q); if (SignBitsOp1 == 1) return 1; unsigned OutValidBits = (TyBits - SignBitsOp0 + 1) + (TyBits - SignBitsOp1 + 1); return OutValidBits > TyBits ? 1 : TyBits - OutValidBits + 1; } case Instruction::PHI: { const PHINode *PN = cast(U); unsigned NumIncomingValues = PN->getNumIncomingValues(); // Don't analyze large in-degree PHIs. if (NumIncomingValues > 4) break; // Unreachable blocks may have zero-operand PHI nodes. if (NumIncomingValues == 0) break; // Take the minimum of all incoming values. This can't infinitely loop // because of our depth threshold. Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q); for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) { if (Tmp == 1) return Tmp; Tmp = std::min( Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q)); } return Tmp; } case Instruction::Trunc: // FIXME: it's tricky to do anything useful for this, but it is an important // case for targets like X86. break; case Instruction::ExtractElement: // Look through extract element. At the moment we keep this simple and skip // tracking the specific element. But at least we might find information // valid for all elements of the vector (for example if vector is sign // extended, shifted, etc). return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q); } // Finally, if we can prove that the top bits of the result are 0's or 1's, // use this information. // If we can examine all elements of a vector constant successfully, we're // done (we can't do any better than that). If not, keep trying. if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits)) return VecSignBits; KnownBits Known(TyBits); computeKnownBits(V, Known, Depth, Q); // If we know that the sign bit is either zero or one, determine the number of // identical bits in the top of the input value. return std::max(FirstAnswer, Known.countMinSignBits()); } /// This function computes the integer multiple of Base that equals V. /// If successful, it returns true and returns the multiple in /// Multiple. If unsuccessful, it returns false. It looks /// through SExt instructions only if LookThroughSExt is true. 
bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple, bool LookThroughSExt, unsigned Depth) { const unsigned MaxDepth = 6; assert(V && "No Value?"); assert(Depth <= MaxDepth && "Limit Search Depth"); assert(V->getType()->isIntegerTy() && "Not integer or pointer type!"); Type *T = V->getType(); ConstantInt *CI = dyn_cast(V); if (Base == 0) return false; if (Base == 1) { Multiple = V; return true; } ConstantExpr *CO = dyn_cast(V); Constant *BaseVal = ConstantInt::get(T, Base); if (CO && CO == BaseVal) { // Multiple is 1. Multiple = ConstantInt::get(T, 1); return true; } if (CI && CI->getZExtValue() % Base == 0) { Multiple = ConstantInt::get(T, CI->getZExtValue() / Base); return true; } if (Depth == MaxDepth) return false; // Limit search depth. Operator *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { default: break; case Instruction::SExt: if (!LookThroughSExt) return false; // otherwise fall through to ZExt LLVM_FALLTHROUGH; case Instruction::ZExt: return ComputeMultiple(I->getOperand(0), Base, Multiple, LookThroughSExt, Depth+1); case Instruction::Shl: case Instruction::Mul: { Value *Op0 = I->getOperand(0); Value *Op1 = I->getOperand(1); if (I->getOpcode() == Instruction::Shl) { ConstantInt *Op1CI = dyn_cast(Op1); if (!Op1CI) return false; // Turn Op0 << Op1 into Op0 * 2^Op1 APInt Op1Int = Op1CI->getValue(); uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1); APInt API(Op1Int.getBitWidth(), 0); API.setBit(BitToSet); Op1 = ConstantInt::get(V->getContext(), API); } Value *Mul0 = nullptr; if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) { if (Constant *Op1C = dyn_cast(Op1)) if (Constant *MulC = dyn_cast(Mul0)) { if (Op1C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op1C = ConstantExpr::getZExt(Op1C, MulC->getType()); if (Op1C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op1C->getType()); // V == Base * (Mul0 * Op1), so return (Mul0 * Op1) Multiple = ConstantExpr::getMul(MulC, Op1C); return true; } if (ConstantInt *Mul0CI = dyn_cast(Mul0)) if (Mul0CI->getValue() == 1) { // V == Base * Op1, so return Op1 Multiple = Op1; return true; } } Value *Mul1 = nullptr; if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) { if (Constant *Op0C = dyn_cast(Op0)) if (Constant *MulC = dyn_cast(Mul1)) { if (Op0C->getType()->getPrimitiveSizeInBits() < MulC->getType()->getPrimitiveSizeInBits()) Op0C = ConstantExpr::getZExt(Op0C, MulC->getType()); if (Op0C->getType()->getPrimitiveSizeInBits() > MulC->getType()->getPrimitiveSizeInBits()) MulC = ConstantExpr::getZExt(MulC, Op0C->getType()); // V == Base * (Mul1 * Op0), so return (Mul1 * Op0) Multiple = ConstantExpr::getMul(MulC, Op0C); return true; } if (ConstantInt *Mul1CI = dyn_cast(Mul1)) if (Mul1CI->getValue() == 1) { // V == Base * Op0, so return Op0 Multiple = Op0; return true; } } } } // We could not determine if V is a multiple of Base. return false; } Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS, const TargetLibraryInfo *TLI) { const Function *F = ICS.getCalledFunction(); if (!F) return Intrinsic::not_intrinsic; if (F->isIntrinsic()) return F->getIntrinsicID(); if (!TLI) return Intrinsic::not_intrinsic; LibFunc Func; // We're going to make assumptions on the semantics of the functions, check // that the target knows that it's available in this environment and it does // not have local linkage. 
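  // For example (a sketch): a readonly call to the C library's sinf maps to
  // Intrinsic::sin via the switch below, but only when TLI confirms sinf is
  // actually available on the target; a static helper merely named sinf is
  // rejected by the local-linkage test.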
if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(*F, Func)) return Intrinsic::not_intrinsic; if (!ICS.onlyReadsMemory()) return Intrinsic::not_intrinsic; // Otherwise check if we have a call to a function that can be turned into a // vector intrinsic. switch (Func) { default: break; case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinl: return Intrinsic::sin; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: return Intrinsic::cos; case LibFunc_exp: case LibFunc_expf: case LibFunc_expl: return Intrinsic::exp; case LibFunc_exp2: case LibFunc_exp2f: case LibFunc_exp2l: return Intrinsic::exp2; case LibFunc_log: case LibFunc_logf: case LibFunc_logl: return Intrinsic::log; case LibFunc_log10: case LibFunc_log10f: case LibFunc_log10l: return Intrinsic::log10; case LibFunc_log2: case LibFunc_log2f: case LibFunc_log2l: return Intrinsic::log2; case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl: return Intrinsic::fabs; case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: return Intrinsic::minnum; case LibFunc_fmax: case LibFunc_fmaxf: case LibFunc_fmaxl: return Intrinsic::maxnum; case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl: return Intrinsic::copysign; case LibFunc_floor: case LibFunc_floorf: case LibFunc_floorl: return Intrinsic::floor; case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill: return Intrinsic::ceil; case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl: return Intrinsic::trunc; case LibFunc_rint: case LibFunc_rintf: case LibFunc_rintl: return Intrinsic::rint; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: return Intrinsic::nearbyint; case LibFunc_round: case LibFunc_roundf: case LibFunc_roundl: return Intrinsic::round; case LibFunc_pow: case LibFunc_powf: case LibFunc_powl: return Intrinsic::pow; case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: return Intrinsic::sqrt; } return Intrinsic::not_intrinsic; } /// Return true if we can prove that the specified FP value is never equal to /// -0.0. /// /// NOTE: this function will need to be revisited when we support non-default /// rounding modes! bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, unsigned Depth) { if (auto *CFP = dyn_cast(V)) return !CFP->getValueAPF().isNegZero(); // Limit search depth. if (Depth == MaxDepth) return false; auto *Op = dyn_cast(V); if (!Op) return false; // Check if the nsz fast-math flag is set. if (auto *FPO = dyn_cast(Op)) if (FPO->hasNoSignedZeros()) return true; // (fadd x, 0.0) is guaranteed to return +0.0, not -0.0. if (match(Op, m_FAdd(m_Value(), m_Zero()))) return true; // sitofp and uitofp turn into +0.0 for zero. if (isa(Op) || isa(Op)) return true; if (auto *Call = dyn_cast(Op)) { Intrinsic::ID IID = getIntrinsicForCallSite(Call, TLI); switch (IID) { default: break; // sqrt(-0.0) = -0.0, no other negative results are possible. case Intrinsic::sqrt: return CannotBeNegativeZero(Call->getArgOperand(0), TLI, Depth + 1); // fabs(x) != -0.0 case Intrinsic::fabs: return true; } } return false; } /// If \p SignBitOnly is true, test for a known 0 sign bit rather than a /// standard ordered compare. e.g. make -0.0 olt 0.0 be true because of the sign /// bit despite comparing equal. 
static bool cannotBeOrderedLessThanZeroImpl(const Value *V, const TargetLibraryInfo *TLI, bool SignBitOnly, unsigned Depth) { // TODO: This function does not do the right thing when SignBitOnly is true // and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform // which flips the sign bits of NaNs. See // https://llvm.org/bugs/show_bug.cgi?id=31702. if (const ConstantFP *CFP = dyn_cast(V)) { return !CFP->getValueAPF().isNegative() || (!SignBitOnly && CFP->getValueAPF().isZero()); } if (Depth == MaxDepth) return false; // Limit search depth. const Operator *I = dyn_cast(V); if (!I) return false; switch (I->getOpcode()) { default: break; // Unsigned integers are always nonnegative. case Instruction::UIToFP: return true; case Instruction::FMul: // x*x is always non-negative or a NaN. if (I->getOperand(0) == I->getOperand(1) && (!SignBitOnly || cast(I)->hasNoNaNs())) return true; LLVM_FALLTHROUGH; case Instruction::FAdd: case Instruction::FDiv: case Instruction::FRem: return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1) && cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, Depth + 1); case Instruction::Select: return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, Depth + 1) && cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, Depth + 1); case Instruction::FPExt: case Instruction::FPTrunc: // Widening/narrowing never change sign. return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1); case Instruction::Call: const auto *CI = cast(I); Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI); switch (IID) { default: break; case Intrinsic::maxnum: return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1) || cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, Depth + 1); case Intrinsic::minnum: return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1) && cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly, Depth + 1); case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::fabs: return true; case Intrinsic::sqrt: // sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0. if (!SignBitOnly) return true; return CI->hasNoNaNs() && (CI->hasNoSignedZeros() || CannotBeNegativeZero(CI->getOperand(0), TLI)); case Intrinsic::powi: if (ConstantInt *Exponent = dyn_cast(I->getOperand(1))) { // powi(x,n) is non-negative if n is even. if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0) return true; } // TODO: This is not correct. Given that exp is an integer, here are the // ways that pow can return a negative value: // // pow(x, exp) --> negative if exp is odd and x is negative. // pow(-0, exp) --> -inf if exp is negative odd. // pow(-0, exp) --> -0 if exp is positive odd. // pow(-inf, exp) --> -0 if exp is negative odd. // pow(-inf, exp) --> -inf if exp is positive odd. // // Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN, // but we must return false if x == -0. Unfortunately we do not currently // have a way of expressing this constraint. See details in // https://llvm.org/bugs/show_bug.cgi?id=31702. return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly, Depth + 1); case Intrinsic::fma: case Intrinsic::fmuladd: // x*x+y is non-negative if y is non-negative. 
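    // (Sketch: @llvm.fmuladd(%x, %x, %y) with %y known non-negative. Since
    // %x * %x is never ordered less than zero, the sum is not either; the
    // hasNoNaNs() test below guards the SignBitOnly case, where a NaN from
    // the multiply could carry a set sign bit.)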
return I->getOperand(0) == I->getOperand(1) && (!SignBitOnly || cast(I)->hasNoNaNs()) && cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly, Depth + 1); } break; } return false; } bool llvm::CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI) { return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0); } bool llvm::SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI) { return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0); } bool llvm::isKnownNeverNaN(const Value *V) { assert(V->getType()->isFPOrFPVectorTy() && "Querying for NaN on non-FP type"); // If we're told that NaNs won't happen, assume they won't. if (auto *FPMathOp = dyn_cast(V)) if (FPMathOp->hasNoNaNs()) return true; // TODO: Handle instructions and potentially recurse like other 'isKnown' // functions. For example, the result of sitofp is never NaN. // Handle scalar constants. if (auto *CFP = dyn_cast(V)) return !CFP->isNaN(); // Bail out for constant expressions, but try to handle vector constants. if (!V->getType()->isVectorTy() || !isa(V)) return false; // For vectors, verify that each element is not NaN. unsigned NumElts = V->getType()->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = cast(V)->getAggregateElement(i); if (!Elt) return false; if (isa(Elt)) continue; auto *CElt = dyn_cast(Elt); if (!CElt || CElt->isNaN()) return false; } // All elements were confirmed not-NaN or undefined. return true; } /// If the specified value can be set by repeating the same byte in memory, /// return the i8 value that it is represented with. This is /// true for all i8 values obviously, but is also true for i32 0, i32 -1, /// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated /// byte store (e.g. i16 0x1234), return null. Value *llvm::isBytewiseValue(Value *V) { // All byte-wide stores are splatable, even of arbitrary variables. if (V->getType()->isIntegerTy(8)) return V; // Handle 'null' ConstantArrayZero etc. if (Constant *C = dyn_cast(V)) if (C->isNullValue()) return Constant::getNullValue(Type::getInt8Ty(V->getContext())); // Constant float and double values can be handled as integer values if the // corresponding integer value is "byteable". An important case is 0.0. if (ConstantFP *CFP = dyn_cast(V)) { if (CFP->getType()->isFloatTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext())); if (CFP->getType()->isDoubleTy()) V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext())); // Don't handle long double formats, which have strange constraints. } // We can handle constant integers that are multiple of 8 bits. if (ConstantInt *CI = dyn_cast(V)) { if (CI->getBitWidth() % 8 == 0) { assert(CI->getBitWidth() > 8 && "8 bits should be handled above!"); if (!CI->getValue().isSplat(8)) return nullptr; return ConstantInt::get(V->getContext(), CI->getValue().trunc(8)); } } // A ConstantDataArray/Vector is splatable if all its members are equal and // also splatable. if (ConstantDataSequential *CA = dyn_cast(V)) { Value *Elt = CA->getElementAsConstant(0); Value *Val = isBytewiseValue(Elt); if (!Val) return nullptr; for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I) if (CA->getElementAsConstant(I) != Elt) return nullptr; return Val; } // Conceptually, we could handle things like: // %a = zext i8 %X to i16 // %b = shl i16 %a, 8 // %c = or i16 %a, %b // but until there is an example that actually needs this, it doesn't seem // worth worrying about. 
  return nullptr;
}

// This is the recursive version of BuildSubAggregate. It takes a few different
// arguments. Idxs is the index within the nested struct From that we are
// looking at now (which is of type IndexedType). IdxSkip is the number of
// indices from Idxs that should be left out when inserting into the resulting
// struct. To is the result struct built so far, new insertvalue instructions
// build on that.
static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
                                SmallVectorImpl<unsigned> &Idxs,
                                unsigned IdxSkip,
                                Instruction *InsertBefore) {
  StructType *STy = dyn_cast<StructType>(IndexedType);
  if (STy) {
    // Save the original To argument so we can modify it
    Value *OrigTo = To;
    // General case, the type indexed by Idxs is a struct
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      // Process each struct element recursively
      Idxs.push_back(i);
      Value *PrevTo = To;
      To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip,
                             InsertBefore);
      Idxs.pop_back();
      if (!To) {
        // Couldn't find any inserted value for this index? Cleanup
        while (PrevTo != OrigTo) {
          InsertValueInst* Del = cast<InsertValueInst>(PrevTo);
          PrevTo = Del->getAggregateOperand();
          Del->eraseFromParent();
        }
        // Stop processing elements
        break;
      }
    }
    // If we successfully found a value for each of our subaggregates
    if (To)
      return To;
  }
  // Base case, the type indexed by SourceIdxs is not a struct, or not all of
  // the struct's elements had a value that was inserted directly. In the
  // latter case, perhaps we can't determine each of the subelements
  // individually, but we might be able to find the complete struct somewhere.

  // Find the value that is at that particular spot
  Value *V = FindInsertedValue(From, Idxs);

  if (!V)
    return nullptr;

  // Insert the value in the new (sub) aggregate
  return InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
                                 "tmp", InsertBefore);
}

// This helper takes a nested struct and extracts a part of it (which is again
// a struct) into a new value. For example, given the struct:
// { a, { b, { c, d }, e } }
// and the indices "1, 1" this returns
// { c, d }.
//
// It does this by inserting an insertvalue for each element in the resulting
// struct, as opposed to just inserting a single struct. This will only work if
// each of the elements of the substruct is known (i.e., inserted into From by
// an insertvalue instruction somewhere).
//
// All inserted insertvalue instructions are inserted before InsertBefore
static Value *BuildSubAggregate(Value *From, ArrayRef<unsigned> idx_range,
                                Instruction *InsertBefore) {
  assert(InsertBefore && "Must have someplace to insert!");
  Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
                                                       idx_range);
  Value *To = UndefValue::get(IndexedType);
  SmallVector<unsigned, 10> Idxs(idx_range.begin(), idx_range.end());
  unsigned IdxSkip = Idxs.size();

  return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
}

/// Given an aggregate and a sequence of indices, see if the scalar value
/// indexed is already around as a register, for example if it was inserted
/// directly into the aggregate.
///
/// If InsertBefore is not null, this function will duplicate (modified)
/// insertvalues when a part of a nested struct is extracted.
Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
                               Instruction *InsertBefore) {
  // Nothing to index? Just return V then (this is useful at the end of our
  // recursion).
  if (idx_range.empty())
    return V;
  // We have indices, so V should have an indexable type.
assert((V->getType()->isStructTy() || V->getType()->isArrayTy()) && "Not looking at a struct or array?"); assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) && "Invalid indices for type?"); if (Constant *C = dyn_cast(V)) { C = C->getAggregateElement(idx_range[0]); if (!C) return nullptr; return FindInsertedValue(C, idx_range.slice(1), InsertBefore); } if (InsertValueInst *I = dyn_cast(V)) { // Loop the indices for the insertvalue instruction in parallel with the // requested indices const unsigned *req_idx = idx_range.begin(); for (const unsigned *i = I->idx_begin(), *e = I->idx_end(); i != e; ++i, ++req_idx) { if (req_idx == idx_range.end()) { // We can't handle this without inserting insertvalues if (!InsertBefore) return nullptr; // The requested index identifies a part of a nested aggregate. Handle // this specially. For example, // %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0 // %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1 // %C = extractvalue {i32, { i32, i32 } } %B, 1 // This can be changed into // %A = insertvalue {i32, i32 } undef, i32 10, 0 // %C = insertvalue {i32, i32 } %A, i32 11, 1 // which allows the unused 0,0 element from the nested struct to be // removed. return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx), InsertBefore); } // This insert value inserts something else than what we are looking for. // See if the (aggregate) value inserted into has the value we are // looking for, then. if (*req_idx != *i) return FindInsertedValue(I->getAggregateOperand(), idx_range, InsertBefore); } // If we end up here, the indices of the insertvalue match with those // requested (though possibly only partially). Now we recursively look at // the inserted value, passing any remaining indices. return FindInsertedValue(I->getInsertedValueOperand(), makeArrayRef(req_idx, idx_range.end()), InsertBefore); } if (ExtractValueInst *I = dyn_cast(V)) { // If we're extracting a value from an aggregate that was extracted from // something else, we can extract from that something else directly instead. // However, we will need to chain I's indices with the requested indices. // Calculate the number of indices required unsigned size = I->getNumIndices() + idx_range.size(); // Allocate some space to put the new indices in SmallVector Idxs; Idxs.reserve(size); // Add indices from the extract value instruction Idxs.append(I->idx_begin(), I->idx_end()); // Add requested indices Idxs.append(idx_range.begin(), idx_range.end()); assert(Idxs.size() == size && "Number of indices added not correct?"); return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore); } // Otherwise, we don't know (such as, extracting from a function return value // or load instruction) return nullptr; } /// Analyze the specified pointer to see if it can be expressed as a base /// pointer plus a constant offset. Return the base and offset to the caller. Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL) { unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType()); APInt ByteOffset(BitWidth, 0); // We walk up the defs but use a visited set to handle unreachable code. In // that case, we stop after accumulating the cycle once (not that it // matters). 
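  // For example (a sketch, assuming 64-bit pointers and 4-byte i32):
  //   %p = getelementptr i32, i32* %base, i64 3
  // decomposes into base %base and Offset == 12; chains of bitcasts and
  // constant GEPs accumulate their byte offsets the same way.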
SmallPtrSet Visited; while (Visited.insert(Ptr).second) { if (Ptr->getType()->isVectorTy()) break; if (GEPOperator *GEP = dyn_cast(Ptr)) { // If one of the values we have visited is an addrspacecast, then // the pointer type of this GEP may be different from the type // of the Ptr parameter which was passed to this function. This // means when we construct GEPOffset, we need to use the size // of GEP's pointer type rather than the size of the original // pointer type. APInt GEPOffset(DL.getPointerTypeSizeInBits(Ptr->getType()), 0); if (!GEP->accumulateConstantOffset(DL, GEPOffset)) break; ByteOffset += GEPOffset.getSExtValue(); Ptr = GEP->getPointerOperand(); } else if (Operator::getOpcode(Ptr) == Instruction::BitCast || Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) { Ptr = cast(Ptr)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(Ptr)) { if (GA->isInterposable()) break; Ptr = GA->getAliasee(); } else { break; } } Offset = ByteOffset.getSExtValue(); return Ptr; } bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP, unsigned CharSize) { // Make sure the GEP has exactly three arguments. if (GEP->getNumOperands() != 3) return false; // Make sure the index-ee is a pointer to array of \p CharSize integers. // CharSize. ArrayType *AT = dyn_cast(GEP->getSourceElementType()); if (!AT || !AT->getElementType()->isIntegerTy(CharSize)) return false; // Check to make sure that the first operand of the GEP is an integer and // has value 0 so that we are sure we're indexing into the initializer. const ConstantInt *FirstIdx = dyn_cast(GEP->getOperand(1)); if (!FirstIdx || !FirstIdx->isZero()) return false; return true; } bool llvm::getConstantDataArrayInfo(const Value *V, ConstantDataArraySlice &Slice, unsigned ElementSize, uint64_t Offset) { assert(V); // Look through bitcast instructions and geps. V = V->stripPointerCasts(); // If the value is a GEP instruction or constant expression, treat it as an // offset. if (const GEPOperator *GEP = dyn_cast(V)) { // The GEP operator should be based on a pointer to string constant, and is // indexing into the string constant. if (!isGEPBasedOnPointerToString(GEP, ElementSize)) return false; // If the second index isn't a ConstantInt, then this is a variable index // into the array. If this occurs, we can't say anything meaningful about // the string. uint64_t StartIdx = 0; if (const ConstantInt *CI = dyn_cast(GEP->getOperand(2))) StartIdx = CI->getZExtValue(); else return false; return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize, StartIdx + Offset); } // The GEP instruction, constant or instruction, must reference a global // variable that is a constant and is initialized. The referenced constant // initializer is the array that we'll use for optimization. const GlobalVariable *GV = dyn_cast(V); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer()) return false; const ConstantDataArray *Array; ArrayType *ArrayTy; if (GV->getInitializer()->isNullValue()) { Type *GVTy = GV->getValueType(); if ( (ArrayTy = dyn_cast(GVTy)) ) { // A zeroinitializer for the array; there is no ConstantDataArray. Array = nullptr; } else { const DataLayout &DL = GV->getParent()->getDataLayout(); uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy); uint64_t Length = SizeInBytes / (ElementSize / 8); if (Length <= Offset) return false; Slice.Array = nullptr; Slice.Offset = 0; Slice.Length = Length - Offset; return true; } } else { // This must be a ConstantDataArray. 
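    // (Sketch of the result shape, with a hypothetical global: for
    //   @s = constant [4 x i8] c"hi\00\00"
    // and Offset == 1, the code below produces Slice.Array == the
    // initializer, Slice.Offset == 1 and Slice.Length == 3.)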
Array = dyn_cast(GV->getInitializer()); if (!Array) return false; ArrayTy = Array->getType(); } if (!ArrayTy->getElementType()->isIntegerTy(ElementSize)) return false; uint64_t NumElts = ArrayTy->getArrayNumElements(); if (Offset > NumElts) return false; Slice.Array = Array; Slice.Offset = Offset; Slice.Length = NumElts - Offset; return true; } /// This function computes the length of a null-terminated C string pointed to /// by V. If successful, it returns true and returns the string in Str. /// If unsuccessful, it returns false. bool llvm::getConstantStringInfo(const Value *V, StringRef &Str, uint64_t Offset, bool TrimAtNul) { ConstantDataArraySlice Slice; if (!getConstantDataArrayInfo(V, Slice, 8, Offset)) return false; if (Slice.Array == nullptr) { if (TrimAtNul) { Str = StringRef(); return true; } if (Slice.Length == 1) { Str = StringRef("", 1); return true; } // We cannot instantiate a StringRef as we do not have an appropriate string // of 0s at hand. return false; } // Start out with the entire array in the StringRef. Str = Slice.Array->getAsString(); // Skip over 'offset' bytes. Str = Str.substr(Slice.Offset); if (TrimAtNul) { // Trim off the \0 and anything after it. If the array is not nul // terminated, we just return the whole end of string. The client may know // some other way that the string is length-bound. Str = Str.substr(0, Str.find('\0')); } return true; } // These next two are very similar to the above, but also look through PHI // nodes. // TODO: See if we can integrate these two together. /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. static uint64_t GetStringLengthH(const Value *V, SmallPtrSetImpl &PHIs, unsigned CharSize) { // Look through noop bitcast instructions. V = V->stripPointerCasts(); // If this is a PHI node, there are two cases: either we have already seen it // or we haven't. if (const PHINode *PN = dyn_cast(V)) { if (!PHIs.insert(PN).second) return ~0ULL; // already in the set. // If it was new, see if all the input strings are the same length. uint64_t LenSoFar = ~0ULL; for (Value *IncValue : PN->incoming_values()) { uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize); if (Len == 0) return 0; // Unknown length -> unknown. if (Len == ~0ULL) continue; if (Len != LenSoFar && LenSoFar != ~0ULL) return 0; // Disagree -> unknown. LenSoFar = Len; } // Success, all agree. return LenSoFar; } // strlen(select(c,x,y)) -> strlen(x) ^ strlen(y) if (const SelectInst *SI = dyn_cast(V)) { uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize); if (Len1 == 0) return 0; uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize); if (Len2 == 0) return 0; if (Len1 == ~0ULL) return Len2; if (Len2 == ~0ULL) return Len1; if (Len1 != Len2) return 0; return Len1; } // Otherwise, see if we can read the string. ConstantDataArraySlice Slice; if (!getConstantDataArrayInfo(V, Slice, CharSize)) return 0; if (Slice.Array == nullptr) return 1; // Search for nul characters unsigned NullIndex = 0; for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) { if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0) break; } return NullIndex + 1; } /// If we can compute the length of the string pointed to by /// the specified pointer, return 'len+1'. If we can't, return 0. 
uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) { if (!V->getType()->isPointerTy()) return 0; SmallPtrSet PHIs; uint64_t Len = GetStringLengthH(V, PHIs, CharSize); // If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return // an empty string as a length. return Len == ~0ULL ? 1 : Len; } /// \brief \p PN defines a loop-variant pointer to an object. Check if the /// previous iteration of the loop was referring to the same object as \p PN. static bool isSameUnderlyingObjectInLoop(const PHINode *PN, const LoopInfo *LI) { // Find the loop-defined value. Loop *L = LI->getLoopFor(PN->getParent()); if (PN->getNumIncomingValues() != 2) return true; // Find the value from previous iteration. auto *PrevValue = dyn_cast(PN->getIncomingValue(0)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) PrevValue = dyn_cast(PN->getIncomingValue(1)); if (!PrevValue || LI->getLoopFor(PrevValue->getParent()) != L) return true; // If a new pointer is loaded in the loop, the pointer references a different // object in every iteration. E.g.: // for (i) // int *p = a[i]; // ... if (auto *Load = dyn_cast(PrevValue)) if (!L->isLoopInvariant(Load->getPointerOperand())) return false; return true; } Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL, unsigned MaxLookup) { if (!V->getType()->isPointerTy()) return V; for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) { if (GEPOperator *GEP = dyn_cast(V)) { V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast || Operator::getOpcode(V) == Instruction::AddrSpaceCast) { V = cast(V)->getOperand(0); } else if (GlobalAlias *GA = dyn_cast(V)) { if (GA->isInterposable()) return V; V = GA->getAliasee(); } else if (isa(V)) { // An alloca can't be further simplified. return V; } else { if (auto CS = CallSite(V)) if (Value *RV = CS.getReturnedArgOperand()) { V = RV; continue; } // See if InstructionSimplify knows any relevant tricks. if (Instruction *I = dyn_cast(V)) // TODO: Acquire a DominatorTree and AssumptionCache and use them. if (Value *Simplified = SimplifyInstruction(I, {DL, I})) { V = Simplified; continue; } return V; } assert(V->getType()->isPointerTy() && "Unexpected operand type!"); } return V; } void llvm::GetUnderlyingObjects(Value *V, SmallVectorImpl &Objects, const DataLayout &DL, LoopInfo *LI, unsigned MaxLookup) { SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(V); do { Value *P = Worklist.pop_back_val(); P = GetUnderlyingObject(P, DL, MaxLookup); if (!Visited.insert(P).second) continue; if (SelectInst *SI = dyn_cast(P)) { Worklist.push_back(SI->getTrueValue()); Worklist.push_back(SI->getFalseValue()); continue; } if (PHINode *PN = dyn_cast(P)) { // If this PHI changes the underlying object in every iteration of the // loop, don't look through it. Consider: // int **A; // for (i) { // Prev = Curr; // Prev = PHI (Prev_0, Curr) // Curr = A[i]; // *Prev, *Curr; // // Prev is tracking Curr one iteration behind so they refer to different // underlying objects. if (!LI || !LI->isLoopHeader(PN->getParent()) || isSameUnderlyingObjectInLoop(PN, LI)) for (Value *IncValue : PN->incoming_values()) Worklist.push_back(IncValue); continue; } Objects.push_back(P); } while (!Worklist.empty()); } /// This is the function that does the work of looking through basic /// ptrtoint+arithmetic+inttoptr sequences. 
static const Value *getUnderlyingObjectFromInt(const Value *V) { do { if (const Operator *U = dyn_cast(V)) { // If we find a ptrtoint, we can transfer control back to the // regular getUnderlyingObjectFromInt. if (U->getOpcode() == Instruction::PtrToInt) return U->getOperand(0); // If we find an add of a constant, a multiplied value, or a phi, it's // likely that the other operand will lead us to the base // object. We don't have to worry about the case where the // object address is somehow being computed by the multiply, // because our callers only care when the result is an // identifiable object. if (U->getOpcode() != Instruction::Add || (!isa(U->getOperand(1)) && Operator::getOpcode(U->getOperand(1)) != Instruction::Mul && !isa(U->getOperand(1)))) return V; V = U->getOperand(0); } else { return V; } assert(V->getType()->isIntegerTy() && "Unexpected operand type!"); } while (true); } /// This is a wrapper around GetUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. /// It returns false if unidentified object is found in GetUnderlyingObjects. bool llvm::getUnderlyingObjectsForCodeGen(const Value *V, SmallVectorImpl &Objects, const DataLayout &DL) { SmallPtrSet Visited; SmallVector Working(1, V); do { V = Working.pop_back_val(); SmallVector Objs; GetUnderlyingObjects(const_cast(V), Objs, DL); for (Value *V : Objs) { if (!Visited.insert(V).second) continue; if (Operator::getOpcode(V) == Instruction::IntToPtr) { const Value *O = getUnderlyingObjectFromInt(cast(V)->getOperand(0)); if (O->getType()->isPointerTy()) { Working.push_back(O); continue; } } // If GetUnderlyingObjects fails to find an identifiable object, // getUnderlyingObjectsForCodeGen also fails for safety. if (!isIdentifiedObject(V)) { Objects.clear(); return false; } Objects.push_back(const_cast(V)); } } while (!Working.empty()); return true; } /// Return true if the only users of this pointer are lifetime markers. bool llvm::onlyUsedByLifetimeMarkers(const Value *V) { for (const User *U : V->users()) { const IntrinsicInst *II = dyn_cast(U); if (!II) return false; if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; } return true; } bool llvm::isSafeToSpeculativelyExecute(const Value *V, const Instruction *CtxI, const DominatorTree *DT) { const Operator *Inst = dyn_cast(V); if (!Inst) return false; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) if (Constant *C = dyn_cast(Inst->getOperand(i))) if (C->canTrap()) return false; switch (Inst->getOpcode()) { default: return true; case Instruction::UDiv: case Instruction::URem: { // x / y is undefined if y == 0. const APInt *V; if (match(Inst->getOperand(1), m_APInt(V))) return *V != 0; return false; } case Instruction::SDiv: case Instruction::SRem: { // x / y is undefined if y == 0 or x == INT_MIN and y == -1 const APInt *Numerator, *Denominator; if (!match(Inst->getOperand(1), m_APInt(Denominator))) return false; // We cannot hoist this division if the denominator is 0. if (*Denominator == 0) return false; // It's safe to hoist if the denominator is not 0 or -1. if (*Denominator != -1) return true; // At this point we know that the denominator is -1. It is safe to hoist as // long we know that the numerator is not INT_MIN. if (match(Inst->getOperand(0), m_APInt(Numerator))) return !Numerator->isMinSignedValue(); // The numerator *might* be MinSignedValue. 
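    // (Concrete case, as a sketch: sdiv i32 %x, -1 traps when %x is
    // INT_MIN == -2147483648, because the mathematical result 2^31 is not
    // representable in i32.)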
return false; } case Instruction::Load: { const LoadInst *LI = cast(Inst); if (!LI->isUnordered() || // Speculative load may create a race that did not exist in the source. LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) || // Speculative load may load data from dirty regions. LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || LI->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) return false; const DataLayout &DL = LI->getModule()->getDataLayout(); return isDereferenceableAndAlignedPointer(LI->getPointerOperand(), LI->getAlignment(), DL, CtxI, DT); } case Instruction::Call: { auto *CI = cast(Inst); const Function *Callee = CI->getCalledFunction(); // The called function could have undefined behavior or side-effects, even // if marked readnone nounwind. return Callee && Callee->isSpeculatable(); } case Instruction::VAArg: case Instruction::Alloca: case Instruction::Invoke: case Instruction::PHI: case Instruction::Store: case Instruction::Ret: case Instruction::Br: case Instruction::IndirectBr: case Instruction::Switch: case Instruction::Unreachable: case Instruction::Fence: case Instruction::AtomicRMW: case Instruction::AtomicCmpXchg: case Instruction::LandingPad: case Instruction::Resume: case Instruction::CatchSwitch: case Instruction::CatchPad: case Instruction::CatchRet: case Instruction::CleanupPad: case Instruction::CleanupRet: return false; // Misc instructions which have effects } } bool llvm::mayBeMemoryDependent(const Instruction &I) { return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I); } OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { // Multiplying n * m significant bits yields a result of n + m significant // bits. If the total number of significant bits does not exceed the // result bit width (minus 1), there is no overflow. // This means if we have enough leading zero bits in the operands // we can guarantee that the result does not overflow. // Ref: "Hacker's Delight" by Henry Warren unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); KnownBits LHSKnown(BitWidth); KnownBits RHSKnown(BitWidth); computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. unsigned ZeroBits = LHSKnown.countMinLeadingZeros() + RHSKnown.countMinLeadingZeros(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. if (ZeroBits >= BitWidth) return OverflowResult::NeverOverflows; // Get the largest possible values for each operand. APInt LHSMax = ~LHSKnown.Zero; APInt RHSMax = ~RHSKnown.Zero; // We know the multiply operation doesn't overflow if the maximum values for // each operand will not overflow after we multiply them together. bool MaxOverflow; (void)LHSMax.umul_ov(RHSMax, MaxOverflow); if (!MaxOverflow) return OverflowResult::NeverOverflows; // We know it always overflows if multiplying the smallest possible values for // the operands also results in overflow. 
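  // (Worked i8 sketch: if each operand has four known leading zeros, both
  // values are at most 15 and 15 * 15 == 225 < 256, so the ZeroBits test
  // above already proved NeverOverflows; if both have their top bit known
  // one, the minimum product is 128 * 128 == 16384 >= 256, which the check
  // below turns into AlwaysOverflows.)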
  bool MinOverflow;
  (void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow);
  if (MinOverflow)
    return OverflowResult::AlwaysOverflows;

  return OverflowResult::MayOverflow;
}

OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
                                                   const Value *RHS,
                                                   const DataLayout &DL,
                                                   AssumptionCache *AC,
                                                   const Instruction *CxtI,
                                                   const DominatorTree *DT) {
  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
  if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
    KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);

    if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
      // The sign bit is set in both cases: this MUST overflow.
      return OverflowResult::AlwaysOverflows;
    }

    if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
      // The sign bit is clear in both cases: this CANNOT overflow.
      return OverflowResult::NeverOverflows;
    }
  }

  return OverflowResult::MayOverflow;
}

/// \brief Return true if we can prove that adding the two values of the
/// knownbits will not overflow.
/// Otherwise return false.
static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
                                    const KnownBits &RHSKnown) {
  // Addition of two 2's complement numbers having opposite signs will never
  // overflow.
  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
      (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
    return true;

  // If either of the values is known to be non-negative, adding them can only
  // overflow if the second is also non-negative, so we can assume that.
  // Two non-negative numbers will only overflow if there is a carry to the
  // sign bit, so we can check if even when the values are as big as possible
  // there is no overflow to the sign bit.
  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
    APInt MaxLHS = ~LHSKnown.Zero;
    MaxLHS.clearSignBit();
    APInt MaxRHS = ~RHSKnown.Zero;
    MaxRHS.clearSignBit();
    APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
    return Result.isSignBitClear();
  }

  // If either of the values is known to be negative, adding them can only
  // overflow if the second is also negative, so we can assume that.
  // Two negative numbers will only overflow if there is no carry to the sign
  // bit, so we can check if even when the values are as small as possible
  // there is overflow to the sign bit.
  if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
    APInt MinLHS = LHSKnown.One;
    MinLHS.clearSignBit();
    APInt MinRHS = RHSKnown.One;
    MinRHS.clearSignBit();
    APInt Result = std::move(MinLHS) + std::move(MinRHS);
    return Result.isSignBitSet();
  }

  // If we reached here it means that we know nothing about the sign bits.
  // In this case we can't know if there will be an overflow, since by
  // changing the sign bits any two values can be made to overflow.
  return false;
}

static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
                                                  const Value *RHS,
                                                  const AddOperator *Add,
                                                  const DataLayout &DL,
                                                  AssumptionCache *AC,
                                                  const Instruction *CxtI,
                                                  const DominatorTree *DT) {
  if (Add && Add->hasNoSignedWrap()) {
    return OverflowResult::NeverOverflows;
  }

  // If LHS and RHS each have at least two sign bits, the addition will look
  // like
  //
  // XX..... +
  // YY.....
  //
  // If the carry into the most significant position is 0, X and Y can't both
  // be 1 and therefore the carry out of the addition is also 0.
  //
  // If the carry into the most significant position is 1, X and Y can't both
  // be 0 and therefore the carry out of the addition is also 1.
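  //
  // (Worked 4-bit sketch: 0010 + 0001 == 0011, where the carry into and out
  // of bit 3 are both 0; 1110 + 1101 == 1011, where the carry into and out
  // of bit 3 are both 1. Neither addition overflows in the signed sense.)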
  //
  // Since the carry into the most significant position is always equal to
  // the carry out of the addition, there is no signed overflow.
  if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
      ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
    return OverflowResult::NeverOverflows;

  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
  KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);

  if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
    return OverflowResult::NeverOverflows;

  // The remaining code needs Add to be available. Return early if it is not.
  if (!Add)
    return OverflowResult::MayOverflow;

  // If the sign of Add is the same as at least one of the operands, this add
  // CANNOT overflow. This is particularly useful when the sum is
  // @llvm.assume'ed non-negative rather than proved so from analyzing its
  // operands.
  bool LHSOrRHSKnownNonNegative =
      (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
  bool LHSOrRHSKnownNegative =
      (LHSKnown.isNegative() || RHSKnown.isNegative());
  if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
    KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
    if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
        (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
      return OverflowResult::NeverOverflows;
    }
  }

  return OverflowResult::MayOverflow;
}

bool llvm::isOverflowIntrinsicNoWrap(const IntrinsicInst *II,
                                     const DominatorTree &DT) {
#ifndef NDEBUG
  auto IID = II->getIntrinsicID();
  assert((IID == Intrinsic::sadd_with_overflow ||
          IID == Intrinsic::uadd_with_overflow ||
          IID == Intrinsic::ssub_with_overflow ||
          IID == Intrinsic::usub_with_overflow ||
          IID == Intrinsic::smul_with_overflow ||
          IID == Intrinsic::umul_with_overflow) &&
         "Not an overflow intrinsic!");
#endif

  SmallVector<const BranchInst *, 2> GuardingBranches;
  SmallVector<const ExtractValueInst *, 2> Results;

  for (const User *U : II->users()) {
    if (const auto *EVI = dyn_cast<ExtractValueInst>(U)) {
      assert(EVI->getNumIndices() == 1 && "Obvious from CI's type");

      if (EVI->getIndices()[0] == 0)
        Results.push_back(EVI);
      else {
        assert(EVI->getIndices()[0] == 1 && "Obvious from CI's type");

        for (const auto *U : EVI->users())
          if (const auto *B = dyn_cast<BranchInst>(U)) {
            assert(B->isConditional() && "How else is it using an i1?");
            GuardingBranches.push_back(B);
          }
      }
    } else {
      // We are using the aggregate directly in a way we don't want to analyze
      // here (storing it to a global, say).
      return false;
    }
  }

  auto AllUsesGuardedByBranch = [&](const BranchInst *BI) {
    BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1));
    if (!NoWrapEdge.isSingleEdge())
      return false;

    // Check if all users of the add are provably no-wrap.
    for (const auto *Result : Results) {
      // If the extractvalue itself is not executed on overflow, then we don't
      // need to check each use separately, since domination is transitive.
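      // (Illustrative: if the branch on the overflow bit jumps straight to a
      // trap block, the no-wrap edge dominates every block that uses the sum,
      // so the dominance check below succeeds without walking each use.)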
      if (DT.dominates(NoWrapEdge, Result->getParent()))
        continue;

      for (auto &RU : Result->uses())
        if (!DT.dominates(NoWrapEdge, RU))
          return false;
    }

    return true;
  };

  return llvm::any_of(GuardingBranches, AllUsesGuardedByBranch);
}

OverflowResult llvm::computeOverflowForSignedAdd(const AddOperator *Add,
                                                 const DataLayout &DL,
                                                 AssumptionCache *AC,
                                                 const Instruction *CxtI,
                                                 const DominatorTree *DT) {
  return ::computeOverflowForSignedAdd(Add->getOperand(0), Add->getOperand(1),
                                       Add, DL, AC, CxtI, DT);
}

OverflowResult llvm::computeOverflowForSignedAdd(const Value *LHS,
                                                 const Value *RHS,
                                                 const DataLayout &DL,
                                                 AssumptionCache *AC,
                                                 const Instruction *CxtI,
                                                 const DominatorTree *DT) {
  return ::computeOverflowForSignedAdd(LHS, RHS, nullptr, DL, AC, CxtI, DT);
}

bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
  // A memory operation returns normally if it isn't volatile. A volatile
  // operation is allowed to trap.
  //
  // An atomic operation isn't guaranteed to return in a reasonable amount of
  // time because it's possible for another thread to interfere with it for an
  // arbitrary length of time, but programs aren't allowed to rely on that.
  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
    return !LI->isVolatile();
  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
    return !SI->isVolatile();
  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
    return !CXI->isVolatile();
  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
    return !RMWI->isVolatile();
  if (const MemIntrinsic *MII = dyn_cast<MemIntrinsic>(I))
    return !MII->isVolatile();

  // If there is no successor, then execution can't transfer to it.
  if (const auto *CRI = dyn_cast<CleanupReturnInst>(I))
    return !CRI->unwindsToCaller();
  if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I))
    return !CatchSwitch->unwindsToCaller();
  if (isa<ResumeInst>(I))
    return false;
  if (isa<ReturnInst>(I))
    return false;
  if (isa<UnreachableInst>(I))
    return false;

  // Calls can throw, or contain an infinite loop, or kill the process.
  if (auto CS = ImmutableCallSite(I)) {
    // Call sites that throw have implicit non-local control flow.
    if (!CS.doesNotThrow())
      return false;

    // Non-throwing call sites can loop infinitely, call exit/pthread_exit
    // etc. and thus not return.  However, LLVM already assumes that
    //
    //  - Thread exiting actions are modeled as writes to memory invisible to
    //    the program.
    //
    //  - Loops that don't have side effects (side effects are volatile/atomic
    //    stores and IO) always terminate (see http://llvm.org/PR965).
    //    Furthermore IO itself is also modeled as writes to memory invisible
    //    to the program.
    //
    // We rely on those assumptions here, and use the memory effects of the
    // call target as a proxy for checking that it always returns.

    // FIXME: This isn't aggressive enough; a call which only writes to a
    // global is guaranteed to return.
    return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() ||
           match(I, m_Intrinsic<Intrinsic::assume>()) ||
           match(I, m_Intrinsic<Intrinsic::sideeffect>());
  }

  // Other instructions return normally.
  return true;
}

bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
                                                  const Loop *L) {
  // The loop header is guaranteed to be executed for every iteration.
  //
  // FIXME: Relax this constraint to cover all basic blocks that are
  // guaranteed to be executed at every iteration.
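  // (Illustrative: an instruction preceded in the header only by PHIs and
  // non-volatile loads runs on every iteration, while one that follows a
  // call that may throw or never return does not.)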
  if (I->getParent() != L->getHeader()) return false;

  for (const Instruction &LI : *L->getHeader()) {
    if (&LI == I) return true;
    if (!isGuaranteedToTransferExecutionToSuccessor(&LI)) return false;
  }
  llvm_unreachable("Instruction not contained in its own parent basic block.");
}

bool llvm::propagatesFullPoison(const Instruction *I) {
  switch (I->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Trunc:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::GetElementPtr:
    // These operations all propagate poison unconditionally. Note that poison
    // is not any particular value, so xor or subtraction of poison with
    // itself still yields poison, not zero.
    return true;

  case Instruction::AShr:
  case Instruction::SExt:
    // For these operations, one bit of the input is replicated across
    // multiple output bits. A replicated poison bit is still poison.
    return true;

  case Instruction::ICmp:
    // Comparing poison with any value yields poison.  This is why, for
    // instance, x s< (x +nsw 1) can be folded to true.
    return true;

  default:
    return false;
  }
}

const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
  switch (I->getOpcode()) {
    case Instruction::Store:
      return cast<StoreInst>(I)->getPointerOperand();

    case Instruction::Load:
      return cast<LoadInst>(I)->getPointerOperand();

    case Instruction::AtomicCmpXchg:
      return cast<AtomicCmpXchgInst>(I)->getPointerOperand();

    case Instruction::AtomicRMW:
      return cast<AtomicRMWInst>(I)->getPointerOperand();

    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::URem:
    case Instruction::SRem:
      return I->getOperand(1);

    default:
      return nullptr;
  }
}

bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
  // We currently only look for uses of poison values within the same basic
  // block, as that makes it easier to guarantee that the uses will be
  // executed given that PoisonI is executed.
  //
  // FIXME: Expand this to consider uses beyond the same basic block. To do
  // this, look out for the distinction between post-dominance and strong
  // post-dominance.
  const BasicBlock *BB = PoisonI->getParent();

  // Set of instructions that we have proved will yield poison if PoisonI
  // does.
  SmallSet<const Value *, 16> YieldsPoison;
  SmallSet<const BasicBlock *, 4> Visited;
  YieldsPoison.insert(PoisonI);
  Visited.insert(PoisonI->getParent());

  BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end();

  unsigned Iter = 0;
  while (Iter++ < MaxDepth) {
    for (auto &I : make_range(Begin, End)) {
      if (&I != PoisonI) {
        const Value *NotPoison = getGuaranteedNonFullPoisonOp(&I);
        if (NotPoison != nullptr && YieldsPoison.count(NotPoison))
          return true;
        if (!isGuaranteedToTransferExecutionToSuccessor(&I))
          return false;
      }

      // Mark poison that propagates from I through uses of I.
      if (YieldsPoison.count(&I)) {
        for (const User *User : I.users()) {
          const Instruction *UserI = cast<Instruction>(User);
          if (propagatesFullPoison(UserI))
            YieldsPoison.insert(User);
        }
      }
    }

    if (auto *NextBB = BB->getSingleSuccessor()) {
      if (Visited.insert(NextBB).second) {
        BB = NextBB;
        Begin = BB->getFirstNonPHI()->getIterator();
        End = BB->end();
        continue;
      }
    }

    break;
  }
  return false;
}

static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) {
  if (FMF.noNaNs())
    return true;

  if (auto *C = dyn_cast<ConstantFP>(V))
    return !C->isNaN();
  return false;
}

static bool isKnownNonZero(const Value *V) {
  if (auto *C = dyn_cast<ConstantFP>(V))
    return !C->isZero();
  return false;
}

/// Match clamp pattern for float types without care about NaNs or signed
/// zeros.
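/// For example (an illustrative case): X < 1.0 ? 1.0 : minnum(X, 255.0) can
/// be treated as maxnum(1.0, minnum(X, 255.0)) once NaNs and signed zeros
/// are out of the picture.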
/// Given non-min/max outer cmp/select from the clamp pattern this
/// function recognizes if it can be substituted by a "canonical" min/max
/// pattern.
static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred,
                                               Value *CmpLHS, Value *CmpRHS,
                                               Value *TrueVal, Value *FalseVal,
                                               Value *&LHS, Value *&RHS) {
  // Try to match
  //   X < C1 ? C1 : Min(X, C2) --> Max(C1, Min(X, C2))
  //   X > C1 ? C1 : Max(X, C2) --> Min(C1, Max(X, C2))
  // and return description of the outer Max/Min.

  // First, check if select has inverse order:
  if (CmpRHS == FalseVal) {
    std::swap(TrueVal, FalseVal);
    Pred = CmpInst::getInversePredicate(Pred);
  }

  // Assume success now. If there's no match, callers should not use these anyway.
  LHS = TrueVal;
  RHS = FalseVal;

  const APFloat *FC1;
  if (CmpRHS != TrueVal || !match(CmpRHS, m_APFloat(FC1)) || !FC1->isFinite())
    return {SPF_UNKNOWN, SPNB_NA, false};

  const APFloat *FC2;
  switch (Pred) {
  case CmpInst::FCMP_OLT:
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (match(FalseVal,
              m_CombineOr(m_OrdFMin(m_Specific(CmpLHS), m_APFloat(FC2)),
                          m_UnordFMin(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
        FC1->compare(*FC2) == APFloat::cmpResult::cmpLessThan)
      return {SPF_FMAXNUM, SPNB_RETURNS_ANY, false};
    break;
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE:
  case CmpInst::FCMP_UGT:
  case CmpInst::FCMP_UGE:
    if (match(FalseVal,
              m_CombineOr(m_OrdFMax(m_Specific(CmpLHS), m_APFloat(FC2)),
                          m_UnordFMax(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
        FC1->compare(*FC2) == APFloat::cmpResult::cmpGreaterThan)
      return {SPF_FMINNUM, SPNB_RETURNS_ANY, false};
    break;
  default:
    break;
  }

  return {SPF_UNKNOWN, SPNB_NA, false};
}

/// Recognize variations of:
///   CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v)))
static SelectPatternResult matchClamp(CmpInst::Predicate Pred,
                                      Value *CmpLHS, Value *CmpRHS,
                                      Value *TrueVal, Value *FalseVal) {
  // Swap the select operands and predicate to match the patterns below.
  if (CmpRHS != TrueVal) {
    Pred = ICmpInst::getSwappedPredicate(Pred);
    std::swap(TrueVal, FalseVal);
  }
  const APInt *C1;
  if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) {
    const APInt *C2;

    // (X <s C1) ? C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1)
    if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) &&
        C1->slt(*C2) && Pred == CmpInst::ICMP_SLT)
      return {SPF_SMAX, SPNB_NA, false};

    // (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1)
    if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) &&
        C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT)
      return {SPF_SMIN, SPNB_NA, false};

    // (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1)
    if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) &&
        C1->ult(*C2) && Pred == CmpInst::ICMP_ULT)
      return {SPF_UMAX, SPNB_NA, false};

    // (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1)
    if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) &&
        C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT)
      return {SPF_UMIN, SPNB_NA, false};
  }
  return {SPF_UNKNOWN, SPNB_NA, false};
}

/// Recognize variations of:
///   a < c ? min(a,b) : min(b,c) ==> min(min(a,b),min(b,c))
static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
                                               Value *CmpLHS, Value *CmpRHS,
-                                               Value *TrueVal, Value *FalseVal) {
+                                               Value *TVal, Value *FVal,
+                                               unsigned Depth) {
  // TODO: Allow FP min/max with nnan/nsz.
  assert(CmpInst::isIntPredicate(Pred) && "Expected integer comparison");

  Value *A, *B;
-  SelectPatternResult L = matchSelectPattern(TrueVal, A, B);
+  SelectPatternResult L = matchSelectPattern(TVal, A, B, nullptr, Depth + 1);
  if (!SelectPatternResult::isMinOrMax(L.Flavor))
    return {SPF_UNKNOWN, SPNB_NA, false};

  Value *C, *D;
-  SelectPatternResult R = matchSelectPattern(FalseVal, C, D);
+  SelectPatternResult R = matchSelectPattern(FVal, C, D, nullptr, Depth + 1);
  if (L.Flavor != R.Flavor)
    return {SPF_UNKNOWN, SPNB_NA, false};

  // Match the compare to the min/max operations of the select operands.
  switch (L.Flavor) {
  case SPF_SMIN:
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE) {
      Pred = ICmpInst::getSwappedPredicate(Pred);
      std::swap(CmpLHS, CmpRHS);
    }
    if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
      break;
    return {SPF_UNKNOWN, SPNB_NA, false};
  case SPF_SMAX:
    if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE) {
      Pred = ICmpInst::getSwappedPredicate(Pred);
      std::swap(CmpLHS, CmpRHS);
    }
    if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
      break;
    return {SPF_UNKNOWN, SPNB_NA, false};
  case SPF_UMIN:
    if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) {
      Pred = ICmpInst::getSwappedPredicate(Pred);
      std::swap(CmpLHS, CmpRHS);
    }
    if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE)
      break;
    return {SPF_UNKNOWN, SPNB_NA, false};
  case SPF_UMAX:
    if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
      Pred = ICmpInst::getSwappedPredicate(Pred);
      std::swap(CmpLHS, CmpRHS);
    }
    if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
      break;
    return {SPF_UNKNOWN, SPNB_NA, false};
  default:
    return {SPF_UNKNOWN, SPNB_NA, false};
  }

  // a pred c ? m(a, b) : m(c, b) --> m(m(a, b), m(c, b))
  if (CmpLHS == A && CmpRHS == C && D == B)
    return {L.Flavor, SPNB_NA, false};

  // a pred d ? m(a, b) : m(b, d) --> m(m(a, b), m(b, d))
  if (CmpLHS == A && CmpRHS == D && C == B)
    return {L.Flavor, SPNB_NA, false};

  // b pred c ? m(a, b) : m(c, a) --> m(m(a, b), m(c, a))
  if (CmpLHS == B && CmpRHS == C && D == A)
    return {L.Flavor, SPNB_NA, false};

  // b pred d ? m(a, b) : m(a, d) --> m(m(a, b), m(a, d))
  if (CmpLHS == B && CmpRHS == D && C == A)
    return {L.Flavor, SPNB_NA, false};

  return {SPF_UNKNOWN, SPNB_NA, false};
}

/// Match non-obvious integer minimum and maximum sequences.
static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
                                       Value *CmpLHS, Value *CmpRHS,
                                       Value *TrueVal, Value *FalseVal,
-                                       Value *&LHS, Value *&RHS) {
+                                       Value *&LHS, Value *&RHS,
+                                       unsigned Depth) {
  // Assume success. If there's no match, callers should not use these anyway.
  LHS = TrueVal;
  RHS = FalseVal;

  SelectPatternResult SPR = matchClamp(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal);
  if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN)
    return SPR;

-  SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal);
+  SPR = matchMinMaxOfMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, Depth);
  if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN)
    return SPR;

  if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
    return {SPF_UNKNOWN, SPNB_NA, false};

  // Z = X -nsw Y
  // (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
  // (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
  if (match(TrueVal, m_Zero()) &&
      match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
    return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};

  // Z = X -nsw Y
  // (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
  // (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
  if (match(FalseVal, m_Zero()) &&
      match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
    return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};

  const APInt *C1;
  if (!match(CmpRHS, m_APInt(C1)))
    return {SPF_UNKNOWN, SPNB_NA, false};

  // An unsigned min/max can be written with a signed compare.
  const APInt *C2;
  if ((CmpLHS == TrueVal && match(FalseVal, m_APInt(C2))) ||
      (CmpLHS == FalseVal && match(TrueVal, m_APInt(C2)))) {
    // Is the sign bit set?
    // (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
    // (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
    if (Pred == CmpInst::ICMP_SLT && C1->isNullValue() &&
        C2->isMaxSignedValue())
      return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};

    // Is the sign bit clear?
    // (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
    // (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
    if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
        C2->isMinSignedValue())
      return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
  }

  // Look through 'not' ops to find disguised signed min/max.
  // (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
  // (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
  if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
      match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
    return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};

  // (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
  // (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
  if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
      match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
    return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};

  return {SPF_UNKNOWN, SPNB_NA, false};
}

static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
                                              FastMathFlags FMF,
                                              Value *CmpLHS, Value *CmpRHS,
                                              Value *TrueVal, Value *FalseVal,
-                                              Value *&LHS, Value *&RHS) {
+                                              Value *&LHS, Value *&RHS,
+                                              unsigned Depth) {
  LHS = CmpLHS;
  RHS = CmpRHS;

  // Signed zero may return inconsistent results between implementations.
  //  (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0
  //  minNum(0.0, -0.0)          // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1)
  // Therefore, we behave conservatively and only proceed if at least one of
  // the operands is known to not be zero or if we don't care about signed
  // zero.
  switch (Pred) {
  default: break;
  // FIXME: Include OGT/OLT/UGT/ULT.
  case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE:
    if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) &&
        !isKnownNonZero(CmpRHS))
      return {SPF_UNKNOWN, SPNB_NA, false};
  }

  SelectPatternNaNBehavior NaNBehavior = SPNB_NA;
  bool Ordered = false;

  // When given one NaN and one non-NaN input:
  //   - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input.
  //   - A simple C99 (a < b ? a : b) construction will return 'b' (as the
  //     ordered comparison fails), which could be NaN or non-NaN.
  // so here we discover exactly what NaN behavior is required/accepted.
  if (CmpInst::isFPPredicate(Pred)) {
    bool LHSSafe = isKnownNonNaN(CmpLHS, FMF);
    bool RHSSafe = isKnownNonNaN(CmpRHS, FMF);

    if (LHSSafe && RHSSafe) {
      // Both operands are known non-NaN.
      NaNBehavior = SPNB_RETURNS_ANY;
    } else if (CmpInst::isOrdered(Pred)) {
      // An ordered comparison will return false when given a NaN, so it
      // returns the RHS.
      Ordered = true;
      if (LHSSafe)
        // LHS is non-NaN, so if RHS is NaN then NaN will be returned.
        NaNBehavior = SPNB_RETURNS_NAN;
      else if (RHSSafe)
        NaNBehavior = SPNB_RETURNS_OTHER;
      else
        // Completely unsafe.
        return {SPF_UNKNOWN, SPNB_NA, false};
    } else {
      Ordered = false;
      // An unordered comparison will return true when given a NaN, so it
      // returns the LHS.
      if (LHSSafe)
        // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned.
        NaNBehavior = SPNB_RETURNS_OTHER;
      else if (RHSSafe)
        NaNBehavior = SPNB_RETURNS_NAN;
      else
        // Completely unsafe.
        return {SPF_UNKNOWN, SPNB_NA, false};
    }
  }

  if (TrueVal == CmpRHS && FalseVal == CmpLHS) {
    std::swap(CmpLHS, CmpRHS);
    Pred = CmpInst::getSwappedPredicate(Pred);
    if (NaNBehavior == SPNB_RETURNS_NAN)
      NaNBehavior = SPNB_RETURNS_OTHER;
    else if (NaNBehavior == SPNB_RETURNS_OTHER)
      NaNBehavior = SPNB_RETURNS_NAN;
    Ordered = !Ordered;
  }

  // ([if]cmp X, Y) ? X : Y
  if (TrueVal == CmpLHS && FalseVal == CmpRHS) {
    switch (Pred) {
    default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality.
    case ICmpInst::ICMP_UGT:
    case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false};
    case ICmpInst::ICMP_SGT:
    case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false};
    case ICmpInst::ICMP_ULT:
    case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false};
    case ICmpInst::ICMP_SLT:
    case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false};
    case FCmpInst::FCMP_UGT:
    case FCmpInst::FCMP_UGE:
    case FCmpInst::FCMP_OGT:
    case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered};
    case FCmpInst::FCMP_ULT:
    case FCmpInst::FCMP_ULE:
    case FCmpInst::FCMP_OLT:
    case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered};
    }
  }

  const APInt *C1;
  if (match(CmpRHS, m_APInt(C1))) {
    if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) ||
        (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) {

      // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
      // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
      if (Pred == ICmpInst::ICMP_SGT &&
          (C1->isNullValue() || C1->isAllOnesValue())) {
        return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
      }

      // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
      // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
      if (Pred == ICmpInst::ICMP_SLT &&
          (C1->isNullValue() || C1->isOneValue())) {
        return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
      }
    }
  }

  if (CmpInst::isIntPredicate(Pred))
-    return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS);
+    return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS, Depth);

  // According to (IEEE 754-2008 5.3.1), minNum(0.0, -0.0) and similar
  // may return either -0.0 or 0.0, so fcmp/select pair has stricter
  // semantics than minNum. Be conservative in such case.
  if (NaNBehavior != SPNB_RETURNS_ANY ||
      (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) &&
       !isKnownNonZero(CmpRHS)))
    return {SPF_UNKNOWN, SPNB_NA, false};

  return matchFastFloatClamp(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS);
}

/// Helps to match a select pattern in case of a type mismatch.
///
/// The function processes the case when type of true and false values of a
/// select instruction differs from type of the cmp instruction operands
/// because of a cast instruction. The function checks if it is legal to move
/// the cast operation after "select". If yes, it returns the new second value
/// of "select" (with the assumption that cast is moved):
/// 1. As operand of cast instruction when both values of "select" are same
/// cast instructions.
/// 2. As restored constant (by applying reverse cast operation) when the
/// first value of the "select" is a cast operation and the second value is a
/// constant.
/// NOTE: We return only the new second value because the first value could be
/// accessed as operand of cast instruction.
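/// For example (an illustrative case): given %cond = icmp slt i32 %x, 100
/// and select i1 %cond, i8 %t, i8 42 where %t = trunc i32 %x to i8, the
/// constant is restored as i32 42 so the min/max can be matched at i32.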
static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
                              Instruction::CastOps *CastOp) {
  auto *Cast1 = dyn_cast<CastInst>(V1);
  if (!Cast1)
    return nullptr;

  *CastOp = Cast1->getOpcode();
  Type *SrcTy = Cast1->getSrcTy();
  if (auto *Cast2 = dyn_cast<CastInst>(V2)) {
    // If V1 and V2 are both the same cast from the same type, look through V1.
    if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy())
      return Cast2->getOperand(0);
    return nullptr;
  }

  auto *C = dyn_cast<Constant>(V2);
  if (!C)
    return nullptr;

  Constant *CastedTo = nullptr;
  switch (*CastOp) {
  case Instruction::ZExt:
    if (CmpI->isUnsigned())
      CastedTo = ConstantExpr::getTrunc(C, SrcTy);
    break;
  case Instruction::SExt:
    if (CmpI->isSigned())
      CastedTo = ConstantExpr::getTrunc(C, SrcTy, true);
    break;
  case Instruction::Trunc:
    Constant *CmpConst;
    if (match(CmpI->getOperand(1), m_Constant(CmpConst)) &&
        CmpConst->getType() == SrcTy) {
      // Here we have the following case:
      //
      //   %cond = cmp iN %x, CmpConst
      //   %tr = trunc iN %x to iK
      //   %narrowsel = select i1 %cond, iK %t, iK C
      //
      // We can always move trunc after select operation:
      //
      //   %cond = cmp iN %x, CmpConst
      //   %widesel = select i1 %cond, iN %x, iN CmpConst
      //   %tr = trunc iN %widesel to iK
      //
      // Note that C could be extended in any way because we don't care about
      // upper bits after truncation. It can't be abs pattern, because it would
      // look like:
      //
      //   select i1 %cond, x, -x.
      //
      // So only min/max pattern could be matched. Such match requires widened C
      // == CmpConst. That is why set widened C = CmpConst, condition trunc
      // CmpConst == C is checked below.
      CastedTo = CmpConst;
    } else {
      CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned());
    }
    break;
  case Instruction::FPTrunc:
    CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true);
    break;
  case Instruction::FPExt:
    CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true);
    break;
  case Instruction::FPToUI:
    CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true);
    break;
  case Instruction::FPToSI:
    CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true);
    break;
  case Instruction::UIToFP:
    CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true);
    break;
  case Instruction::SIToFP:
    CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true);
    break;
  default:
    break;
  }

  if (!CastedTo)
    return nullptr;

  // Make sure the cast doesn't lose any information.
  Constant *CastedBack =
      ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true);
  if (CastedBack != C)
    return nullptr;

  return CastedTo;
}

SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
-                                             Instruction::CastOps *CastOp) {
+                                             Instruction::CastOps *CastOp,
+                                             unsigned Depth) {
+  if (Depth >= MaxDepth)
+    return {SPF_UNKNOWN, SPNB_NA, false};
+
  SelectInst *SI = dyn_cast<SelectInst>(V);
  if (!SI) return {SPF_UNKNOWN, SPNB_NA, false};

  CmpInst *CmpI = dyn_cast<CmpInst>(SI->getCondition());
  if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false};

  CmpInst::Predicate Pred = CmpI->getPredicate();
  Value *CmpLHS = CmpI->getOperand(0);
  Value *CmpRHS = CmpI->getOperand(1);
  Value *TrueVal = SI->getTrueValue();
  Value *FalseVal = SI->getFalseValue();
  FastMathFlags FMF;
  if (isa<FCmpInst>(CmpI))
    FMF = CmpI->getFastMathFlags();

  // Bail out early.
  if (CmpI->isEquality())
    return {SPF_UNKNOWN, SPNB_NA, false};

  // Deal with type mismatches.
  if (CastOp && CmpLHS->getType() != TrueVal->getType()) {
    if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp)) {
      // If this is a potential fmin/fmax with a cast to integer, then ignore
      // -0.0 because there is no corresponding integer value.
      if (*CastOp == Instruction::FPToSI || *CastOp == Instruction::FPToUI)
        FMF.setNoSignedZeros();
      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
                                  cast<CastInst>(TrueVal)->getOperand(0), C,
-                                  LHS, RHS);
+                                  LHS, RHS, Depth);
    }
    if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp)) {
      // If this is a potential fmin/fmax with a cast to integer, then ignore
      // -0.0 because there is no corresponding integer value.
      if (*CastOp == Instruction::FPToSI || *CastOp == Instruction::FPToUI)
        FMF.setNoSignedZeros();
      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
                                  C, cast<CastInst>(FalseVal)->getOperand(0),
-                                  LHS, RHS);
+                                  LHS, RHS, Depth);
    }
  }
  return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal,
-                              LHS, RHS);
+                              LHS, RHS, Depth);
}

/// Return true if "icmp Pred LHS RHS" is always true.
static bool isTruePredicate(CmpInst::Predicate Pred, const Value *LHS,
                            const Value *RHS, const DataLayout &DL,
                            unsigned Depth) {
  assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!");
  if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
    return true;

  switch (Pred) {
  default:
    return false;

  case CmpInst::ICMP_SLE: {
    const APInt *C;

    // LHS s<= LHS +_{nsw} C   if C >= 0
    if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))))
      return !C->isNegative();
    return false;
  }

  case CmpInst::ICMP_ULE: {
    const APInt *C;

    // LHS u<= LHS +_{nuw} C   for any C
    if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C))))
      return true;

    // Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
    auto MatchNUWAddsToSameValue = [&](const Value *A, const Value *B,
                                       const Value *&X, const APInt *&CA,
                                       const APInt *&CB) {
      if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) &&
          match(B, m_NUWAdd(m_Specific(X), m_APInt(CB))))
        return true;

      // If X & C == 0 then (X | C) == X +_{nuw} C
      if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
          match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
        KnownBits Known(CA->getBitWidth());
        computeKnownBits(X, Known, DL, Depth + 1, /*AC*/ nullptr,
                         /*CxtI*/ nullptr, /*DT*/ nullptr);
        if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
          return true;
      }

      return false;
    };

    const Value *X;
    const APInt *CLHS, *CRHS;
    if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS))
      return CLHS->ule(*CRHS);

    return false;
  }
  }
}

/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred
/// ALHS ARHS" is true.  Otherwise, return None.
static Optional<bool>
isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
                      const Value *ARHS, const Value *BLHS, const Value *BRHS,
                      const DataLayout &DL, unsigned Depth) {
  switch (Pred) {
  default:
    return None;

  case CmpInst::ICMP_SLT:
  case CmpInst::ICMP_SLE:
    if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth) &&
        isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth))
      return true;
    return None;

  case CmpInst::ICMP_ULT:
  case CmpInst::ICMP_ULE:
    if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth) &&
        isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth))
      return true;
    return None;
  }
}

/// Return true if the operands of the two compares match.  IsSwappedOps is
/// true when the operands match, but are swapped.
static bool isMatchingOps(const Value *ALHS, const Value *ARHS,
                          const Value *BLHS, const Value *BRHS,
                          bool &IsSwappedOps) {
  bool IsMatchingOps = (ALHS == BLHS && ARHS == BRHS);
  IsSwappedOps = (ALHS == BRHS && ARHS == BLHS);
  return IsMatchingOps || IsSwappedOps;
}

/// Return true if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS BRHS" is
/// true.  Return false if "icmp1 APred ALHS ARHS" implies "icmp2 BPred BLHS
/// BRHS" is false.
/// Otherwise, return None if we can't infer anything.
static Optional<bool>
isImpliedCondMatchingOperands(CmpInst::Predicate APred, const Value *ALHS,
                              const Value *ARHS, CmpInst::Predicate BPred,
                              const Value *BLHS, const Value *BRHS,
                              bool IsSwappedOps) {
  // Canonicalize the operands so they're matching.
  if (IsSwappedOps) {
    std::swap(BLHS, BRHS);
    BPred = ICmpInst::getSwappedPredicate(BPred);
  }
  if (CmpInst::isImpliedTrueByMatchingCmp(APred, BPred))
    return true;
  if (CmpInst::isImpliedFalseByMatchingCmp(APred, BPred))
    return false;

  return None;
}

/// Return true if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS C2" is
/// true.  Return false if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS
/// C2" is false.  Otherwise, return None if we can't infer anything.
static Optional<bool>
isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, const Value *ALHS,
                                 const ConstantInt *C1,
                                 CmpInst::Predicate BPred,
                                 const Value *BLHS, const ConstantInt *C2) {
  assert(ALHS == BLHS && "LHS operands must match.");
  ConstantRange DomCR =
      ConstantRange::makeExactICmpRegion(APred, C1->getValue());
  ConstantRange CR =
      ConstantRange::makeAllowedICmpRegion(BPred, C2->getValue());
  ConstantRange Intersection = DomCR.intersectWith(CR);
  ConstantRange Difference = DomCR.difference(CR);
  if (Intersection.isEmptySet())
    return false;
  if (Difference.isEmptySet())
    return true;

  return None;
}

/// Return true if LHS implies RHS is true.  Return false if LHS implies RHS
/// is false.  Otherwise, return None if we can't infer anything.
static Optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
                                         const ICmpInst *RHS,
                                         const DataLayout &DL, bool LHSIsTrue,
                                         unsigned Depth) {
  Value *ALHS = LHS->getOperand(0);
  Value *ARHS = LHS->getOperand(1);
  // The rest of the logic assumes the LHS condition is true.  If that's not
  // the case, invert the predicate to make it so.
  ICmpInst::Predicate APred =
      LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate();

  Value *BLHS = RHS->getOperand(0);
  Value *BRHS = RHS->getOperand(1);
  ICmpInst::Predicate BPred = RHS->getPredicate();

  // Can we infer anything when the two compares have matching operands?
  bool IsSwappedOps;
  if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, IsSwappedOps)) {
    if (Optional<bool> Implication = isImpliedCondMatchingOperands(
            APred, ALHS, ARHS, BPred, BLHS, BRHS, IsSwappedOps))
      return Implication;
    // No amount of additional analysis will infer the second condition, so
    // early exit.
    return None;
  }

  // Can we infer anything when the LHS operands match and the RHS operands
  // are constants (not necessarily matching)?
  if (ALHS == BLHS && isa<ConstantInt>(ARHS) && isa<ConstantInt>(BRHS)) {
    if (Optional<bool> Implication = isImpliedCondMatchingImmOperands(
            APred, ALHS, cast<ConstantInt>(ARHS), BPred, BLHS,
            cast<ConstantInt>(BRHS)))
      return Implication;
    // No amount of additional analysis will infer the second condition, so
    // early exit.
    return None;
  }

  if (APred == BPred)
    return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth);

  return None;
}

/// Return true if LHS implies RHS is true.  Return false if LHS implies RHS
/// is false.  Otherwise, return None if we can't infer anything.  We expect
/// the RHS to be an icmp and the LHS to be an 'and' or an 'or' instruction.
static Optional<bool> isImpliedCondAndOr(const BinaryOperator *LHS,
                                         const ICmpInst *RHS,
                                         const DataLayout &DL, bool LHSIsTrue,
                                         unsigned Depth) {
  // The LHS must be an 'or' or an 'and' instruction.
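  // (For example, illustratively: if %c = and i1 %a, %b is known true, then
  // %a and %b are each true, so proving the implication from either leg
  // alone is enough.)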
  assert((LHS->getOpcode() == Instruction::And ||
          LHS->getOpcode() == Instruction::Or) &&
         "Expected LHS to be 'and' or 'or'.");

  assert(Depth <= MaxDepth && "Hit recursion limit");

  // If the result of an 'or' is false, then we know both legs of the 'or' are
  // false.  Similarly, if the result of an 'and' is true, then we know both
  // legs of the 'and' are true.
  Value *ALHS, *ARHS;
  if ((!LHSIsTrue && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) ||
      (LHSIsTrue && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) {
    // FIXME: Make this non-recursion.
    if (Optional<bool> Implication =
            isImpliedCondition(ALHS, RHS, DL, LHSIsTrue, Depth + 1))
      return Implication;
    if (Optional<bool> Implication =
            isImpliedCondition(ARHS, RHS, DL, LHSIsTrue, Depth + 1))
      return Implication;
    return None;
  }
  return None;
}

Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
                                        const DataLayout &DL, bool LHSIsTrue,
                                        unsigned Depth) {
  // Bail out when we hit the limit.
  if (Depth == MaxDepth)
    return None;

  // A mismatch occurs when we compare a scalar cmp to a vector cmp, for
  // example.
  if (LHS->getType() != RHS->getType())
    return None;

  Type *OpTy = LHS->getType();
  assert(OpTy->isIntOrIntVectorTy(1) && "Expected integer type only!");

  // LHS ==> RHS by definition
  if (LHS == RHS)
    return LHSIsTrue;

  // FIXME: Extending the code below to handle vectors.
  if (OpTy->isVectorTy())
    return None;

  assert(OpTy->isIntegerTy(1) && "implied by above");

  // Both LHS and RHS are icmps.
  const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
  const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS);
  if (LHSCmp && RHSCmp)
    return isImpliedCondICmps(LHSCmp, RHSCmp, DL, LHSIsTrue, Depth);

  // The LHS should be an 'or' or an 'and' instruction.  We expect the RHS to
  // be an icmp. FIXME: Add support for and/or on the RHS.
  const BinaryOperator *LHSBO = dyn_cast<BinaryOperator>(LHS);
  if (LHSBO && RHSCmp) {
    if ((LHSBO->getOpcode() == Instruction::And ||
         LHSBO->getOpcode() == Instruction::Or))
      return isImpliedCondAndOr(LHSBO, RHSCmp, DL, LHSIsTrue, Depth);
  }
  return None;
}
Index: head/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
===================================================================
--- head/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp	(revision 328752)
+++ head/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp	(revision 328753)
@@ -1,1358 +1,1362 @@
//===- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator ---*- C++ -*-==//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the IRTranslator class.
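/// For example (illustrative): the IR
///   %sum = add i32 %a, %b
/// is translated into the generic MachineInstr
///   %2:_(s32) = G_ADD %0, %1
/// which later passes legalize, register-bank select and instruction select.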
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "irtranslator"

using namespace llvm;

char IRTranslator::ID = 0;

INITIALIZE_PASS_BEGIN(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
                false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
                false, false)

static void reportTranslationError(MachineFunction &MF,
                                   const TargetPassConfig &TPC,
                                   OptimizationRemarkEmitter &ORE,
                                   OptimizationRemarkMissed &R) {
  MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);

  // Print the function name explicitly if we don't have a debug location
  // (which makes the diagnostic less useful) or if we're going to emit a raw
  // error.
  if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
    R << (" (in function: " + MF.getName() + ")").str();

  if (TPC.isGlobalISelAbortEnabled())
    report_fatal_error(R.getMsg());
  else
    ORE.emit(R);
}

IRTranslator::IRTranslator() : MachineFunctionPass(ID) {
  initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
}

void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
  unsigned &ValReg = ValToVReg[&Val];

  if (ValReg)
    return ValReg;

  // Fill ValRegsSequence with the sequence of registers
  // we need to concat together to produce the value.
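  // (Illustrative: the first request for the constant i32 7 creates one s32
  // vreg here, and the translate() call below emits a G_CONSTANT defining
  // it; later requests for the same Value reuse the cached vreg.)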
assert(Val.getType()->isSized() && "Don't know how to create an empty vreg"); unsigned VReg = MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL)); ValReg = VReg; if (auto CV = dyn_cast(&Val)) { bool Success = translate(*CV, VReg); if (!Success) { OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", MF->getFunction().getSubprogram(), &MF->getFunction().getEntryBlock()); R << "unable to translate constant: " << ore::NV("Type", Val.getType()); reportTranslationError(*MF, *TPC, *ORE, R); return VReg; } } return VReg; } int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) { if (FrameIndices.find(&AI) != FrameIndices.end()) return FrameIndices[&AI]; unsigned ElementSize = DL->getTypeStoreSize(AI.getAllocatedType()); unsigned Size = ElementSize * cast(AI.getArraySize())->getZExtValue(); // Always allocate at least one byte. Size = std::max(Size, 1u); unsigned Alignment = AI.getAlignment(); if (!Alignment) Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); int &FI = FrameIndices[&AI]; FI = MF->getFrameInfo().CreateStackObject(Size, Alignment, false, &AI); return FI; } unsigned IRTranslator::getMemOpAlignment(const Instruction &I) { unsigned Alignment = 0; Type *ValTy = nullptr; if (const StoreInst *SI = dyn_cast(&I)) { Alignment = SI->getAlignment(); ValTy = SI->getValueOperand()->getType(); } else if (const LoadInst *LI = dyn_cast(&I)) { Alignment = LI->getAlignment(); ValTy = LI->getType(); } else { OptimizationRemarkMissed R("gisel-irtranslator", "", &I); R << "unable to translate memop: " << ore::NV("Opcode", &I); reportTranslationError(*MF, *TPC, *ORE, R); return 1; } return Alignment ? Alignment : DL->getABITypeAlignment(ValTy); } MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) { MachineBasicBlock *&MBB = BBToMBB[&BB]; assert(MBB && "BasicBlock was not encountered before"); return *MBB; } void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { assert(NewPred && "new predecessor must be a real MachineBasicBlock"); MachinePreds[Edge].push_back(NewPred); } bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: handle signed/unsigned wrapping flags. // Get or create a virtual register for each value. // Unless the value is a Constant => loadimm cst? // or inline constant each time? // Creation of a virtual register needs to have a size. unsigned Op0 = getOrCreateVReg(*U.getOperand(0)); unsigned Op1 = getOrCreateVReg(*U.getOperand(1)); unsigned Res = getOrCreateVReg(U); MIRBuilder.buildInstr(Opcode).addDef(Res).addUse(Op0).addUse(Op1); return true; } bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) { // -0.0 - X --> G_FNEG if (isa(U.getOperand(0)) && U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) { MIRBuilder.buildInstr(TargetOpcode::G_FNEG) .addDef(getOrCreateVReg(U)) .addUse(getOrCreateVReg(*U.getOperand(1))); return true; } return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); } bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { const CmpInst *CI = dyn_cast(&U); unsigned Op0 = getOrCreateVReg(*U.getOperand(0)); unsigned Op1 = getOrCreateVReg(*U.getOperand(1)); unsigned Res = getOrCreateVReg(U); CmpInst::Predicate Pred = CI ? 
CI->getPredicate() : static_cast( cast(U).getPredicate()); if (CmpInst::isIntPredicate(Pred)) MIRBuilder.buildICmp(Pred, Res, Op0, Op1); else if (Pred == CmpInst::FCMP_FALSE) MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getNullValue(CI->getType()))); else if (Pred == CmpInst::FCMP_TRUE) MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType()))); else MIRBuilder.buildFCmp(Pred, Res, Op0, Op1); return true; } bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) { const ReturnInst &RI = cast(U); const Value *Ret = RI.getReturnValue(); if (Ret && DL->getTypeStoreSize(Ret->getType()) == 0) Ret = nullptr; // The target may mess up with the insertion point, but // this is not important as a return is the last instruction // of the block anyway. return CLI->lowerReturn(MIRBuilder, Ret, !Ret ? 0 : getOrCreateVReg(*Ret)); } bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { const BranchInst &BrInst = cast(U); unsigned Succ = 0; if (!BrInst.isUnconditional()) { // We want a G_BRCOND to the true BB followed by an unconditional branch. unsigned Tst = getOrCreateVReg(*BrInst.getCondition()); const BasicBlock &TrueTgt = *cast(BrInst.getSuccessor(Succ++)); MachineBasicBlock &TrueBB = getMBB(TrueTgt); MIRBuilder.buildBrCond(Tst, TrueBB); } const BasicBlock &BrTgt = *cast(BrInst.getSuccessor(Succ)); MachineBasicBlock &TgtBB = getMBB(BrTgt); MachineBasicBlock &CurBB = MIRBuilder.getMBB(); // If the unconditional target is the layout successor, fallthrough. if (!CurBB.isLayoutSuccessor(&TgtBB)) MIRBuilder.buildBr(TgtBB); // Link successors. for (const BasicBlock *Succ : BrInst.successors()) CurBB.addSuccessor(&getMBB(*Succ)); return true; } bool IRTranslator::translateSwitch(const User &U, MachineIRBuilder &MIRBuilder) { // For now, just translate as a chain of conditional branches. // FIXME: could we share most of the logic/code in // SelectionDAGBuilder::visitSwitch between SelectionDAG and GlobalISel? // At first sight, it seems most of the logic in there is independent of // SelectionDAG-specifics and a lot of work went in to optimize switch // lowering in there. const SwitchInst &SwInst = cast(U); const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition()); const BasicBlock *OrigBB = SwInst.getParent(); LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL); for (auto &CaseIt : SwInst.cases()) { const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue()); const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue); MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); const BasicBlock *TrueBB = CaseIt.getCaseSuccessor(); MachineBasicBlock &TrueMBB = getMBB(*TrueBB); MIRBuilder.buildBrCond(Tst, TrueMBB); CurMBB.addSuccessor(&TrueMBB); addMachineCFGPred({OrigBB, TrueBB}, &CurMBB); MachineBasicBlock *FalseMBB = MF->CreateMachineBasicBlock(SwInst.getParent()); // Insert the comparison blocks one after the other. 
MF->insert(std::next(CurMBB.getIterator()), FalseMBB); MIRBuilder.buildBr(*FalseMBB); CurMBB.addSuccessor(FalseMBB); MIRBuilder.setMBB(*FalseMBB); } // handle default case const BasicBlock *DefaultBB = SwInst.getDefaultDest(); MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB); MIRBuilder.buildBr(DefaultMBB); MachineBasicBlock &CurMBB = MIRBuilder.getMBB(); CurMBB.addSuccessor(&DefaultMBB); addMachineCFGPred({OrigBB, DefaultBB}, &CurMBB); return true; } bool IRTranslator::translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder) { const IndirectBrInst &BrInst = cast(U); const unsigned Tgt = getOrCreateVReg(*BrInst.getAddress()); MIRBuilder.buildBrIndirect(Tgt); // Link successors. MachineBasicBlock &CurBB = MIRBuilder.getMBB(); for (const BasicBlock *Succ : BrInst.successors()) CurBB.addSuccessor(&getMBB(*Succ)); return true; } bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOLoad; if (DL->getTypeStoreSize(LI.getType()) == 0) return true; unsigned Res = getOrCreateVReg(LI); unsigned Addr = getOrCreateVReg(*LI.getPointerOperand()); MIRBuilder.buildLoad( Res, Addr, *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()), Flags, DL->getTypeStoreSize(LI.getType()), getMemOpAlignment(LI), AAMDNodes(), nullptr, LI.getSyncScopeID(), LI.getOrdering())); return true; } bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { const StoreInst &SI = cast(U); auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone; Flags |= MachineMemOperand::MOStore; if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0) return true; unsigned Val = getOrCreateVReg(*SI.getValueOperand()); unsigned Addr = getOrCreateVReg(*SI.getPointerOperand()); MIRBuilder.buildStore( Val, Addr, *MF->getMachineMemOperand( MachinePointerInfo(SI.getPointerOperand()), Flags, DL->getTypeStoreSize(SI.getValueOperand()->getType()), getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSyncScopeID(), SI.getOrdering())); return true; } bool IRTranslator::translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder) { const Value *Src = U.getOperand(0); Type *Int32Ty = Type::getInt32Ty(U.getContext()); SmallVector Indices; // If Src is a single element ConstantStruct, translate extractvalue // to that element to avoid inserting a cast instruction. if (auto CS = dyn_cast(Src)) if (CS->getNumOperands() == 1) { unsigned Res = getOrCreateVReg(*CS->getOperand(0)); ValToVReg[&U] = Res; return true; } // getIndexedOffsetInType is designed for GEPs, so the first index is the // usual array element rather than looking into the actual aggregate. 
Indices.push_back(ConstantInt::get(Int32Ty, 0)); if (const ExtractValueInst *EVI = dyn_cast(&U)) { for (auto Idx : EVI->indices()) Indices.push_back(ConstantInt::get(Int32Ty, Idx)); } else { for (unsigned i = 1; i < U.getNumOperands(); ++i) Indices.push_back(U.getOperand(i)); } uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset); return true; } bool IRTranslator::translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder) { const Value *Src = U.getOperand(0); Type *Int32Ty = Type::getInt32Ty(U.getContext()); SmallVector Indices; // getIndexedOffsetInType is designed for GEPs, so the first index is the // usual array element rather than looking into the actual aggregate. Indices.push_back(ConstantInt::get(Int32Ty, 0)); if (const InsertValueInst *IVI = dyn_cast(&U)) { for (auto Idx : IVI->indices()) Indices.push_back(ConstantInt::get(Int32Ty, Idx)); } else { for (unsigned i = 2; i < U.getNumOperands(); ++i) Indices.push_back(U.getOperand(i)); } uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices); unsigned Res = getOrCreateVReg(U); unsigned Inserted = getOrCreateVReg(*U.getOperand(1)); MIRBuilder.buildInsert(Res, getOrCreateVReg(*Src), Inserted, Offset); return true; } bool IRTranslator::translateSelect(const User &U, MachineIRBuilder &MIRBuilder) { unsigned Res = getOrCreateVReg(U); unsigned Tst = getOrCreateVReg(*U.getOperand(0)); unsigned Op0 = getOrCreateVReg(*U.getOperand(1)); unsigned Op1 = getOrCreateVReg(*U.getOperand(2)); MIRBuilder.buildSelect(Res, Tst, Op0, Op1); return true; } bool IRTranslator::translateBitCast(const User &U, MachineIRBuilder &MIRBuilder) { // If we're bitcasting to the source type, we can reuse the source vreg. if (getLLTForType(*U.getOperand(0)->getType(), *DL) == getLLTForType(*U.getType(), *DL)) { // Get the source vreg now, to avoid invalidating ValToVReg. unsigned SrcReg = getOrCreateVReg(*U.getOperand(0)); unsigned &Reg = ValToVReg[&U]; // If we already assigned a vreg for this bitcast, we can't change that. // Emit a copy to satisfy the users we already emitted. if (Reg) MIRBuilder.buildCopy(Reg, SrcReg); else Reg = SrcReg; return true; } return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder); } bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { unsigned Op = getOrCreateVReg(*U.getOperand(0)); unsigned Res = getOrCreateVReg(U); MIRBuilder.buildInstr(Opcode).addDef(Res).addUse(Op); return true; } bool IRTranslator::translateGetElementPtr(const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: support vector GEPs. if (U.getType()->isVectorTy()) return false; Value &Op0 = *U.getOperand(0); unsigned BaseReg = getOrCreateVReg(Op0); Type *PtrIRTy = Op0.getType(); LLT PtrTy = getLLTForType(*PtrIRTy, *DL); Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy); LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL); int64_t Offset = 0; for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U); GTI != E; ++GTI) { const Value *Idx = GTI.getOperand(); if (StructType *StTy = GTI.getStructTypeOrNull()) { unsigned Field = cast(Idx)->getUniqueInteger().getZExtValue(); Offset += DL->getStructLayout(StTy)->getElementOffset(Field); continue; } else { uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); // If this is a scalar constant or a splat vector of constants, // handle it quickly. 
if (const auto *CI = dyn_cast(Idx)) { Offset += ElementSize * CI->getSExtValue(); continue; } if (Offset != 0) { unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg); BaseReg = NewBaseReg; Offset = 0; } // N = N + Idx * ElementSize; unsigned ElementSizeReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize)); unsigned IdxReg = getOrCreateVReg(*Idx); if (MRI->getType(IdxReg) != OffsetTy) { unsigned NewIdxReg = MRI->createGenericVirtualRegister(OffsetTy); MIRBuilder.buildSExtOrTrunc(NewIdxReg, IdxReg); IdxReg = NewIdxReg; } unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy); MIRBuilder.buildMul(OffsetReg, ElementSizeReg, IdxReg); unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy); MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg); BaseReg = NewBaseReg; } } if (Offset != 0) { unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset)); MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg); return true; } MIRBuilder.buildCopy(getOrCreateVReg(U), BaseReg); return true; } bool IRTranslator::translateMemfunc(const CallInst &CI, MachineIRBuilder &MIRBuilder, unsigned ID) { LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL); Type *DstTy = CI.getArgOperand(0)->getType(); if (cast(DstTy)->getAddressSpace() != 0 || SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0)) return false; SmallVector Args; for (int i = 0; i < 3; ++i) { const auto &Arg = CI.getArgOperand(i); Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType()); } const char *Callee; switch (ID) { case Intrinsic::memmove: case Intrinsic::memcpy: { Type *SrcTy = CI.getArgOperand(1)->getType(); if(cast(SrcTy)->getAddressSpace() != 0) return false; Callee = ID == Intrinsic::memcpy ? 
"memcpy" : "memmove"; break; } case Intrinsic::memset: Callee = "memset"; break; default: return false; } return CLI->lowerCall(MIRBuilder, CI.getCallingConv(), MachineOperand::CreateES(Callee), CallLowering::ArgInfo(0, CI.getType()), Args); } void IRTranslator::getStackGuard(unsigned DstReg, MachineIRBuilder &MIRBuilder) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF)); auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD); MIB.addDef(DstReg); auto &TLI = *MF->getSubtarget().getTargetLowering(); Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent()); if (!Global) return; MachinePointerInfo MPInfo(Global); MachineInstr::mmo_iterator MemRefs = MF->allocateMemRefsArray(1); auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable; *MemRefs = MF->getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8, DL->getPointerABIAlignment(0)); MIB.setMemRefs(MemRefs, MemRefs + 1); } bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op, MachineIRBuilder &MIRBuilder) { LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL); LLT s1 = LLT::scalar(1); unsigned Width = Ty.getSizeInBits(); unsigned Res = MRI->createGenericVirtualRegister(Ty); unsigned Overflow = MRI->createGenericVirtualRegister(s1); auto MIB = MIRBuilder.buildInstr(Op) .addDef(Res) .addDef(Overflow) .addUse(getOrCreateVReg(*CI.getOperand(0))) .addUse(getOrCreateVReg(*CI.getOperand(1))); if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) { unsigned Zero = getOrCreateVReg( *Constant::getNullValue(Type::getInt1Ty(CI.getContext()))); MIB.addUse(Zero); } MIRBuilder.buildSequence(getOrCreateVReg(CI), {Res, Overflow}, {0, Width}); return true; } bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, MachineIRBuilder &MIRBuilder) { switch (ID) { default: break; case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: // Stack coloring is not enabled in O0 (which we care about now) so we can // drop these. Make sure someone notices when we start compiling at higher // opts though. if (MF->getTarget().getOptLevel() != CodeGenOpt::None) return false; return true; case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(CI); assert(DI.getVariable() && "Missing variable"); const Value *Address = DI.getAddress(); if (!Address || isa(Address)) { DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); return true; } assert(DI.getVariable()->isValidLocationForIntrinsic( MIRBuilder.getDebugLoc()) && "Expected inlined-at fields to agree"); auto AI = dyn_cast(Address); if (AI && AI->isStaticAlloca()) { // Static allocas are tracked at the MF level, no need for DBG_VALUE // instructions (in fact, they get ignored if they *do* exist). MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), getOrCreateFrameIndex(*AI), DI.getDebugLoc()); } else MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address), DI.getVariable(), DI.getExpression()); return true; } case Intrinsic::vaend: // No target I know of cares about va_end. Certainly no in-tree target // does. Simplest intrinsic ever! 
return true; case Intrinsic::vastart: { auto &TLI = *MF->getSubtarget().getTargetLowering(); Value *Ptr = CI.getArgOperand(0); unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; MIRBuilder.buildInstr(TargetOpcode::G_VASTART) .addUse(getOrCreateVReg(*Ptr)) .addMemOperand(MF->getMachineMemOperand( MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0)); return true; } case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. const DbgValueInst &DI = cast<DbgValueInst>(CI); const Value *V = DI.getValue(); assert(DI.getVariable()->isValidLocationForIntrinsic( MIRBuilder.getDebugLoc()) && "Expected inlined-at fields to agree"); if (!V) { // Currently the optimizer can produce this; insert an undef to // help debugging. Probably the optimizer should not do this. MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression()); } else if (const auto *CI = dyn_cast<Constant>(V)) { MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression()); } else { unsigned Reg = getOrCreateVReg(*V); // FIXME: This does not handle register-indirect values at offset 0. The // direct/indirect thing shouldn't really be handled by something as // implicit as reg+noreg vs reg+imm in the first place, but it seems // pretty baked in right now. MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); } return true; } case Intrinsic::uadd_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_UADDE, MIRBuilder); case Intrinsic::sadd_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SADDO, MIRBuilder); case Intrinsic::usub_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_USUBE, MIRBuilder); case Intrinsic::ssub_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SSUBO, MIRBuilder); case Intrinsic::umul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder); case Intrinsic::smul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); case Intrinsic::pow: MIRBuilder.buildInstr(TargetOpcode::G_FPOW) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))); return true; case Intrinsic::exp: MIRBuilder.buildInstr(TargetOpcode::G_FEXP) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); return true; case Intrinsic::exp2: MIRBuilder.buildInstr(TargetOpcode::G_FEXP2) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); return true; case Intrinsic::log: MIRBuilder.buildInstr(TargetOpcode::G_FLOG) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); return true; case Intrinsic::log2: MIRBuilder.buildInstr(TargetOpcode::G_FLOG2) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))); return true; case Intrinsic::fma: MIRBuilder.buildInstr(TargetOpcode::G_FMA) .addDef(getOrCreateVReg(CI)) .addUse(getOrCreateVReg(*CI.getArgOperand(0))) .addUse(getOrCreateVReg(*CI.getArgOperand(1))) .addUse(getOrCreateVReg(*CI.getArgOperand(2))); return true; case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: return translateMemfunc(CI, MIRBuilder, ID); case Intrinsic::eh_typeid_for: { GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0)); unsigned Reg = getOrCreateVReg(CI); unsigned TypeID = MF->getTypeIDFor(GV); MIRBuilder.buildConstant(Reg, TypeID); return true; } case Intrinsic::objectsize: { // If we don't know by now, we're never going to know.
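// i.e. (illustrative) the intrinsic folds to its documented fallback value:
//   %r = G_CONSTANT -1  ; when the 'min' argument is false (size unknown)
//   %r = G_CONSTANT 0   ; when the 'min' argument is true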
const ConstantInt *Min = cast<ConstantInt>(CI.getArgOperand(1)); MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0); return true; } case Intrinsic::stackguard: getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); AllocaInst *Slot = cast<AllocaInst>(CI.getArgOperand(1)); MIRBuilder.buildStore( GuardVal, getOrCreateVReg(*Slot), *MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, getOrCreateFrameIndex(*Slot)), MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, PtrTy.getSizeInBits() / 8, 8)); return true; } } return false; } bool IRTranslator::translateInlineAsm(const CallInst &CI, MachineIRBuilder &MIRBuilder) { const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue()); if (!IA.getConstraintString().empty()) return false; unsigned ExtraInfo = 0; if (IA.hasSideEffects()) ExtraInfo |= InlineAsm::Extra_HasSideEffects; if (IA.getDialect() == InlineAsm::AD_Intel) ExtraInfo |= InlineAsm::Extra_AsmDialect; MIRBuilder.buildInstr(TargetOpcode::INLINEASM) .addExternalSymbol(IA.getAsmString().c_str()) .addImm(ExtraInfo); return true; } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { const CallInst &CI = cast<CallInst>(U); auto TII = MF->getTarget().getIntrinsicInfo(); const Function *F = CI.getCalledFunction(); + // FIXME: support Windows dllimport function calls. + if (F && F->hasDLLImportStorageClass()) + return false; + if (CI.isInlineAsm()) return translateInlineAsm(CI, MIRBuilder); Intrinsic::ID ID = Intrinsic::not_intrinsic; if (F && F->isIntrinsic()) { ID = F->getIntrinsicID(); if (TII && ID == Intrinsic::not_intrinsic) ID = static_cast<Intrinsic::ID>(TII->getIntrinsicID(F)); } if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) { unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI); SmallVector<unsigned, 8> Args; for (auto &Arg: CI.arg_operands()) Args.push_back(getOrCreateVReg(*Arg)); MF->getFrameInfo().setHasCalls(true); return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() { return getOrCreateVReg(*CI.getCalledValue()); }); } assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic"); if (translateKnownIntrinsic(CI, ID, MIRBuilder)) return true; unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI); MachineInstrBuilder MIB = MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory()); for (auto &Arg : CI.arg_operands()) { // Some intrinsics take metadata parameters. Reject them. if (isa<MetadataAsValue>(Arg)) return false; MIB.addUse(getOrCreateVReg(*Arg)); } // Add a MachineMemOperand if it is a target mem intrinsic. const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); TargetLowering::IntrinsicInfo Info; // TODO: Add a GlobalISel version of getTgtMemIntrinsic. if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) { uint64_t Size = Info.memVT.getStoreSize(); MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal), Info.flags, Size, Info.align)); } return true; } bool IRTranslator::translateInvoke(const User &U, MachineIRBuilder &MIRBuilder) { const InvokeInst &I = cast<InvokeInst>(U); MCContext &Context = MF->getContext(); const BasicBlock *ReturnBB = I.getSuccessor(0); const BasicBlock *EHPadBB = I.getSuccessor(1); const Value *Callee = I.getCalledValue(); const Function *Fn = dyn_cast<Function>(Callee); if (isa<InlineAsm>(Callee)) return false; // FIXME: support invoking patchpoint and statepoint intrinsics.
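// Sketch of the structure emitted for a translatable invoke (illustrative):
//   EH_LABEL <begin>
//   ... lowered call ...
//   EH_LABEL <end>
//   G_BR %returnbb
// with both the return block and the EH pad recorded as successors, so the
// try-region handed to MF->addInvoke below is delimited by the two labels.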
if (Fn && Fn->isIntrinsic()) return false; // FIXME: support whatever these are. if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) return false; // FIXME: support Windows exception handling. if (!isa(EHPadBB->front())) return false; // Emit the actual call, bracketed by EH_LABELs so that the MF knows about // the region covered by the try. MCSymbol *BeginSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol); unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I); SmallVector Args; for (auto &Arg: I.arg_operands()) Args.push_back(getOrCreateVReg(*Arg)); if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, [&]() { return getOrCreateVReg(*I.getCalledValue()); })) return false; MCSymbol *EndSymbol = Context.createTempSymbol(); MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol); // FIXME: track probabilities. MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB), &ReturnMBB = getMBB(*ReturnBB); MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol); MIRBuilder.getMBB().addSuccessor(&ReturnMBB); MIRBuilder.getMBB().addSuccessor(&EHPadMBB); MIRBuilder.buildBr(ReturnMBB); return true; } bool IRTranslator::translateLandingPad(const User &U, MachineIRBuilder &MIRBuilder) { const LandingPadInst &LP = cast(U); MachineBasicBlock &MBB = MIRBuilder.getMBB(); addLandingPadInfo(LP, MBB); MBB.setIsEHPad(); // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother. auto &TLI = *MF->getSubtarget().getTargetLowering(); const Constant *PersonalityFn = MF->getFunction().getPersonalityFn(); if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && TLI.getExceptionSelectorRegister(PersonalityFn) == 0) return true; // If landingpad's return type is token type, we don't create DAG nodes // for its exception pointer and selector value. The extraction of exception // pointer or selector value from token type landingpads is not currently // supported. if (LP.getType()->isTokenTy()) return true; // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MIRBuilder.buildInstr(TargetOpcode::EH_LABEL) .addSym(MF->addLandingPad(&MBB)); LLT Ty = getLLTForType(*LP.getType(), *DL); unsigned Undef = MRI->createGenericVirtualRegister(Ty); MIRBuilder.buildUndef(Undef); SmallVector Tys; for (Type *Ty : cast(LP.getType())->elements()) Tys.push_back(getLLTForType(*Ty, *DL)); assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); // Mark exception register as live in. unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); if (!ExceptionReg) return false; MBB.addLiveIn(ExceptionReg); unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]), Tmp = MRI->createGenericVirtualRegister(Ty); MIRBuilder.buildCopy(VReg, ExceptionReg); MIRBuilder.buildInsert(Tmp, Undef, VReg, 0); unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); if (!SelectorReg) return false; MBB.addLiveIn(SelectorReg); // N.b. the exception selector register always has pointer type and may not // match the actual IR-level type in the landingpad so an extra cast is // needed. 
unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]); MIRBuilder.buildCopy(PtrVReg, SelectorReg); VReg = MRI->createGenericVirtualRegister(Tys[1]); MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg); MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg, Tys[0].getSizeInBits()); return true; } bool IRTranslator::translateAlloca(const User &U, MachineIRBuilder &MIRBuilder) { auto &AI = cast<AllocaInst>(U); if (AI.isStaticAlloca()) { unsigned Res = getOrCreateVReg(AI); int FI = getOrCreateFrameIndex(AI); MIRBuilder.buildFrameIndex(Res, FI); return true; } // Now we're in the harder dynamic case. Type *Ty = AI.getAllocatedType(); unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment()); unsigned NumElts = getOrCreateVReg(*AI.getArraySize()); Type *IntPtrIRTy = DL->getIntPtrType(AI.getType()); LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL); if (MRI->getType(NumElts) != IntPtrTy) { unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy); MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts); NumElts = ExtElts; } unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy); unsigned TySize = getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty))); MIRBuilder.buildMul(AllocSize, NumElts, TySize); LLT PtrTy = getLLTForType(*AI.getType(), *DL); auto &TLI = *MF->getSubtarget().getTargetLowering(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy); MIRBuilder.buildCopy(SPTmp, SPReg); unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy); MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); // Handle alignment. We have to realign if the allocation granule was smaller // than stack alignment, or the specific alloca requires more than stack // alignment. unsigned StackAlign = MF->getSubtarget().getFrameLowering()->getStackAlignment(); Align = std::max(Align, StackAlign); if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) { // Round the size of the allocation up to the stack alignment size // by adding SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy); MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align)); AllocTmp = AlignedAlloc; } MIRBuilder.buildCopy(SPReg, AllocTmp); MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI); assert(MF->getFrameInfo().hasVarSizedObjects()); return true; } bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) { // FIXME: We may need more info about the type. Because of how LLT works, // we're completely discarding the i64/double distinction here (amongst // others). Fortunately the ABIs I know of where that matters don't use va_arg // anyway but that's not guaranteed. MIRBuilder.buildInstr(TargetOpcode::G_VAARG) .addDef(getOrCreateVReg(U)) .addUse(getOrCreateVReg(*U.getOperand(0))) .addImm(DL->getABITypeAlignment(U.getType())); return true; } bool IRTranslator::translateInsertElement(const User &U, MachineIRBuilder &MIRBuilder) { // If it is a <1 x Ty> vector, use the scalar as it is // not a legal vector type in LLT.
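// e.g. (illustrative) %r = insertelement <1 x i32> undef, i32 %x, i32 0
// simply records %x's vreg for %r below: LLT has no one-element vectors, so
// the value is kept as a plain scalar.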
if (U.getType()->getVectorNumElements() == 1) { unsigned Elt = getOrCreateVReg(*U.getOperand(1)); ValToVReg[&U] = Elt; return true; } unsigned Res = getOrCreateVReg(U); unsigned Val = getOrCreateVReg(*U.getOperand(0)); unsigned Elt = getOrCreateVReg(*U.getOperand(1)); unsigned Idx = getOrCreateVReg(*U.getOperand(2)); MIRBuilder.buildInsertVectorElement(Res, Val, Elt, Idx); return true; } bool IRTranslator::translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder) { // If it is a <1 x Ty> vector, use the scalar as it is // not a legal vector type in LLT. if (U.getOperand(0)->getType()->getVectorNumElements() == 1) { unsigned Elt = getOrCreateVReg(*U.getOperand(0)); ValToVReg[&U] = Elt; return true; } unsigned Res = getOrCreateVReg(U); unsigned Val = getOrCreateVReg(*U.getOperand(0)); unsigned Idx = getOrCreateVReg(*U.getOperand(1)); MIRBuilder.buildExtractVectorElement(Res, Val, Idx); return true; } bool IRTranslator::translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) { MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR) .addDef(getOrCreateVReg(U)) .addUse(getOrCreateVReg(*U.getOperand(0))) .addUse(getOrCreateVReg(*U.getOperand(1))) .addUse(getOrCreateVReg(*U.getOperand(2))); return true; } bool IRTranslator::translatePHI(const User &U, MachineIRBuilder &MIRBuilder) { const PHINode &PI = cast<PHINode>(U); auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI); MIB.addDef(getOrCreateVReg(PI)); PendingPHIs.emplace_back(&PI, MIB.getInstr()); return true; } void IRTranslator::finishPendingPhis() { for (std::pair<const PHINode *, MachineInstr *> &Phi : PendingPHIs) { const PHINode *PI = Phi.first; MachineInstrBuilder MIB(*MF, Phi.second); // All MachineBasicBlocks exist, add them to the PHI. We assume IRTranslator // won't create extra control flow here, otherwise we need to find the // dominating predecessor here (or perhaps force the weirder IRTranslators // to provide a simple boundary). SmallSet<const BasicBlock *, 4> HandledPreds; for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { auto IRPred = PI->getIncomingBlock(i); if (HandledPreds.count(IRPred)) continue; HandledPreds.insert(IRPred); unsigned ValReg = getOrCreateVReg(*PI->getIncomingValue(i)); for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) { assert(Pred->isSuccessor(MIB->getParent()) && "incorrect CFG at MachineBasicBlock level"); MIB.addUse(ValReg); MIB.addMBB(Pred); } } } } bool IRTranslator::translate(const Instruction &Inst) { CurBuilder.setDebugLoc(Inst.getDebugLoc()); switch(Inst.getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder); #include "llvm/IR/Instruction.def" default: return false; } } bool IRTranslator::translate(const Constant &C, unsigned Reg) { if (auto CI = dyn_cast<ConstantInt>(&C)) EntryBuilder.buildConstant(Reg, *CI); else if (auto CF = dyn_cast<ConstantFP>(&C)) EntryBuilder.buildFConstant(Reg, *CF); else if (isa<UndefValue>(C)) EntryBuilder.buildUndef(Reg); else if (isa<ConstantPointerNull>(C)) EntryBuilder.buildConstant(Reg, 0); else if (auto GV = dyn_cast<GlobalValue>(&C)) EntryBuilder.buildGlobalValue(Reg, GV); else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) { if (!CAZ->getType()->isVectorTy()) return false; // Return the scalar if it is a <1 x Ty> vector. if (CAZ->getNumElements() == 1) return translate(*CAZ->getElementValue(0u), Reg); std::vector<unsigned> Ops; for (unsigned i = 0; i < CAZ->getNumElements(); ++i) { Constant &Elt = *CAZ->getElementValue(i); Ops.push_back(getOrCreateVReg(Elt)); } EntryBuilder.buildMerge(Reg, Ops); } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) { // Return the scalar if it is a <1 x Ty> vector.
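// e.g. (illustrative) a constant <2 x i32> <i32 1, i32 2> becomes two
// G_CONSTANTs combined with G_MERGE_VALUES; the single-element case below
// instead forwards the scalar element directly.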
if (CV->getNumElements() == 1) return translate(*CV->getElementAsConstant(0), Reg); std::vector<unsigned> Ops; for (unsigned i = 0; i < CV->getNumElements(); ++i) { Constant &Elt = *CV->getElementAsConstant(i); Ops.push_back(getOrCreateVReg(Elt)); } EntryBuilder.buildMerge(Reg, Ops); } else if (auto CE = dyn_cast<ConstantExpr>(&C)) { switch(CE->getOpcode()) { #define HANDLE_INST(NUM, OPCODE, CLASS) \ case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder); #include "llvm/IR/Instruction.def" default: return false; } } else if (auto CS = dyn_cast<ConstantStruct>(&C)) { // Return the element if it is a single element ConstantStruct. if (CS->getNumOperands() == 1) { unsigned EltReg = getOrCreateVReg(*CS->getOperand(0)); EntryBuilder.buildCast(Reg, EltReg); return true; } SmallVector<unsigned, 4> Ops; SmallVector<uint64_t, 4> Indices; uint64_t Offset = 0; for (unsigned i = 0; i < CS->getNumOperands(); ++i) { unsigned OpReg = getOrCreateVReg(*CS->getOperand(i)); Ops.push_back(OpReg); Indices.push_back(Offset); Offset += MRI->getType(OpReg).getSizeInBits(); } EntryBuilder.buildSequence(Reg, Ops, Indices); } else if (auto CV = dyn_cast<ConstantVector>(&C)) { if (CV->getNumOperands() == 1) return translate(*CV->getOperand(0), Reg); SmallVector<unsigned, 4> Ops; for (unsigned i = 0; i < CV->getNumOperands(); ++i) { Ops.push_back(getOrCreateVReg(*CV->getOperand(i))); } EntryBuilder.buildMerge(Reg, Ops); } else return false; return true; } void IRTranslator::finalizeFunction() { // Release the memory used by the different maps we // needed during the translation. PendingPHIs.clear(); ValToVReg.clear(); FrameIndices.clear(); MachinePreds.clear(); // MachineIRBuilder::DebugLoc can outlive the DILocation it holds. Clear it // to avoid accessing free'd memory (in runOnMachineFunction) and to avoid // destroying it twice (in ~IRTranslator() and ~LLVMContext()) EntryBuilder = MachineIRBuilder(); CurBuilder = MachineIRBuilder(); } bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MF = &CurMF; const Function &F = MF->getFunction(); if (F.empty()) return false; CLI = MF->getSubtarget().getCallLowering(); CurBuilder.setMF(*MF); EntryBuilder.setMF(*MF); MRI = &MF->getRegInfo(); DL = &F.getParent()->getDataLayout(); TPC = &getAnalysis<TargetPassConfig>(); ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); assert(PendingPHIs.empty() && "stale PHIs"); if (!DL->isLittleEndian()) { // Currently we don't properly handle big endian code. OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", F.getSubprogram(), &F.getEntryBlock()); R << "unable to translate in big endian mode"; reportTranslationError(*MF, *TPC, *ORE, R); } // Release the per-function state when we return, whether we succeeded or not. auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); }); // Set up a separate basic-block for the arguments and constants MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock(); MF->push_back(EntryBB); EntryBuilder.setMBB(*EntryBB); // Create all blocks, in IR order, to preserve the layout. for (const BasicBlock &BB: F) { auto *&MBB = BBToMBB[&BB]; MBB = MF->CreateMachineBasicBlock(&BB); MF->push_back(MBB); if (BB.hasAddressTaken()) MBB->setHasAddressTaken(); } // Make our arguments/constants entry block fall through to the IR entry block. EntryBB->addSuccessor(&getMBB(F.front())); // Lower the actual args into this basic block. SmallVector<unsigned, 8> VRegArgs; for (const Argument &Arg: F.args()) { if (DL->getTypeStoreSize(Arg.getType()) == 0) continue; // Don't handle zero sized types.
VRegArgs.push_back(getOrCreateVReg(Arg)); } if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) { OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", F.getSubprogram(), &F.getEntryBlock()); R << "unable to lower arguments: " << ore::NV("Prototype", F.getType()); reportTranslationError(*MF, *TPC, *ORE, R); return false; } // And translate the function! for (const BasicBlock &BB: F) { MachineBasicBlock &MBB = getMBB(BB); // Set the insertion point of all the following translations to // the end of this basic block. CurBuilder.setMBB(MBB); for (const Instruction &Inst: BB) { if (translate(Inst)) continue; OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure", Inst.getDebugLoc(), &BB); R << "unable to translate instruction: " << ore::NV("Opcode", &Inst); if (ORE->allowExtraAnalysis("gisel-irtranslator")) { std::string InstStrStorage; raw_string_ostream InstStr(InstStrStorage); InstStr << Inst; R << ": '" << InstStr.str() << "'"; } reportTranslationError(*MF, *TPC, *ORE, R); return false; } } finishPendingPhis(); // Merge the argument lowering and constants block with its single // successor, the LLVM-IR entry block. We want the basic block to // be maximal. assert(EntryBB->succ_size() == 1 && "Custom BB used for lowering should have only one successor"); // Get the successor of the current entry block. MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin(); assert(NewEntryBB.pred_size() == 1 && "LLVM-IR entry block has a predecessor!?"); // Move all the instruction from the current entry block to the // new entry block. NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(), EntryBB->end()); // Update the live-in information for the new entry block. for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins()) NewEntryBB.addLiveIn(LiveIn); NewEntryBB.sortUniqueLiveIns(); // Get rid of the now empty basic block. EntryBB->removeSuccessor(&NewEntryBB); MF->remove(EntryBB); MF->DeleteMachineBasicBlock(EntryBB); assert(&MF->front() == &NewEntryBB && "New entry wasn't next in the list of basic block!"); return false; } Index: head/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- head/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision 328752) +++ head/contrib/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp (revision 328753) @@ -1,938 +1,955 @@ //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file This file implements the LegalizerHelper class to legalize /// individual instructions and the LegalizeMachineIR wrapper pass for the /// primary legalization. 
// //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "legalizer" using namespace llvm; LegalizerHelper::LegalizerHelper(MachineFunction &MF) : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) { MIRBuilder.setMF(MF); } LegalizerHelper::LegalizeResult LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs())); auto Action = LI.getAction(MI, MRI); switch (std::get<0>(Action)) { case LegalizerInfo::Legal: DEBUG(dbgs() << ".. Already legal\n"); return AlreadyLegal; case LegalizerInfo::Libcall: DEBUG(dbgs() << ".. Convert to libcall\n"); return libcall(MI); case LegalizerInfo::NarrowScalar: DEBUG(dbgs() << ".. Narrow scalar\n"); return narrowScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::WidenScalar: DEBUG(dbgs() << ".. Widen scalar\n"); return widenScalar(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::Lower: DEBUG(dbgs() << ".. Lower\n"); return lower(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::FewerElements: DEBUG(dbgs() << ".. Reduce number of elements\n"); return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action)); case LegalizerInfo::Custom: DEBUG(dbgs() << ".. Custom legalization\n"); return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized : UnableToLegalize; default: DEBUG(dbgs() << ".. Unable to legalize\n"); return UnableToLegalize; } } void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts, SmallVectorImpl &VRegs) { for (int i = 0; i < NumParts; ++i) VRegs.push_back(MRI.createGenericVirtualRegister(Ty)); MIRBuilder.buildUnmerge(VRegs, Reg); } static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { switch (Opcode) { case TargetOpcode::G_SDIV: assert(Size == 32 && "Unsupported size"); return RTLIB::SDIV_I32; case TargetOpcode::G_UDIV: assert(Size == 32 && "Unsupported size"); return RTLIB::UDIV_I32; case TargetOpcode::G_SREM: assert(Size == 32 && "Unsupported size"); return RTLIB::SREM_I32; case TargetOpcode::G_UREM: assert(Size == 32 && "Unsupported size"); return RTLIB::UREM_I32; case TargetOpcode::G_FADD: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32; case TargetOpcode::G_FSUB: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::SUB_F64 : RTLIB::SUB_F32; case TargetOpcode::G_FMUL: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::MUL_F64 : RTLIB::MUL_F32; case TargetOpcode::G_FDIV: assert((Size == 32 || Size == 64) && "Unsupported size"); return Size == 64 ? RTLIB::DIV_F64 : RTLIB::DIV_F32; case TargetOpcode::G_FREM: return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32; case TargetOpcode::G_FPOW: return Size == 64 ? 
RTLIB::POW_F64 : RTLIB::POW_F32; } llvm_unreachable("Unknown libcall function"); } LegalizerHelper::LegalizeResult llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall, const CallLowering::ArgInfo &Result, ArrayRef Args) { auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering(); auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); const char *Name = TLI.getLibcallName(Libcall); MIRBuilder.getMF().getFrameInfo().setHasCalls(true); if (!CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall), MachineOperand::CreateES(Name), Result, Args)) return LegalizerHelper::UnableToLegalize; return LegalizerHelper::Legalized; } static LegalizerHelper::LegalizeResult simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size, Type *OpType) { auto Libcall = getRTLibDesc(MI.getOpcode(), Size); return createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), OpType}, {{MI.getOperand(1).getReg(), OpType}, {MI.getOperand(2).getReg(), OpType}}); } LegalizerHelper::LegalizeResult LegalizerHelper::libcall(MachineInstr &MI) { LLT LLTy = MRI.getType(MI.getOperand(0).getReg()); unsigned Size = LLTy.getSizeInBits(); auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: case TargetOpcode::G_SREM: case TargetOpcode::G_UREM: { Type *HLTy = Type::getInt32Ty(Ctx); auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); if (Status != Legalized) return Status; break; } case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: case TargetOpcode::G_FPOW: case TargetOpcode::G_FREM: { Type *HLTy = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx); auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy); if (Status != Legalized) return Status; break; } } MI.eraseFromParent(); return Legalized; } LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { // FIXME: Don't know how to handle secondary types yet. if (TypeIdx != 0 && MI.getOpcode() != TargetOpcode::G_EXTRACT) return UnableToLegalize; MIRBuilder.setInstr(MI); int64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); int64_t NarrowSize = NarrowTy.getSizeInBits(); switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_IMPLICIT_DEF: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; SmallVector DstRegs; for (int i = 0; i < NumParts; ++i) { unsigned Dst = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildUndef(Dst); DstRegs.push_back(Dst); } MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_ADD: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; // Expand in terms of carry-setting/consuming G_ADDE instructions. 
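// e.g. narrowing an s64 G_ADD to s32 parts yields, roughly (illustrative
// vregs):
//   %a0:s32, %a1:s32 = G_UNMERGE_VALUES %lhs
//   %b0:s32, %b1:s32 = G_UNMERGE_VALUES %rhs
//   %zero:s1 = G_CONSTANT 0
//   %r0:s32, %k0:s1 = G_UADDE %a0, %b0, %zero
//   %r1:s32, %k1:s1 = G_UADDE %a1, %b1, %k0
//   %res:s64 = G_MERGE_VALUES %r0, %r1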
int NumParts = SizeOp0 / NarrowTy.getSizeInBits(); SmallVector Src1Regs, Src2Regs, DstRegs; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); unsigned CarryIn = MRI.createGenericVirtualRegister(LLT::scalar(1)); MIRBuilder.buildConstant(CarryIn, 0); for (int i = 0; i < NumParts; ++i) { unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); unsigned CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1)); MIRBuilder.buildUAdde(DstReg, CarryOut, Src1Regs[i], Src2Regs[i], CarryIn); DstRegs.push_back(DstReg); CarryIn = CarryOut; } unsigned DstReg = MI.getOperand(0).getReg(); MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_EXTRACT: { if (TypeIdx != 1) return UnableToLegalize; int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); // FIXME: add support for when SizeOp1 isn't an exact multiple of // NarrowSize. if (SizeOp1 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp1 / NarrowSize; SmallVector SrcRegs, DstRegs; SmallVector Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); unsigned OpReg = MI.getOperand(0).getReg(); int64_t OpStart = MI.getOperand(2).getImm(); int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); for (int i = 0; i < NumParts; ++i) { unsigned SrcStart = i * NarrowSize; if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) { // No part of the extract uses this subregister, ignore it. continue; } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) { // The entire subregister is extracted, forward the value. DstRegs.push_back(SrcRegs[i]); continue; } // OpSegStart is where this destination segment would start in OpReg if it // extended infinitely in both directions. int64_t ExtractOffset, SegSize; if (OpStart < SrcStart) { ExtractOffset = 0; SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart); } else { ExtractOffset = OpStart - SrcStart; SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize); } unsigned SegReg = SrcRegs[i]; if (ExtractOffset != 0 || SegSize != NarrowSize) { // A genuine extract is needed. SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset); } DstRegs.push_back(SegReg); } MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_INSERT: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; SmallVector SrcRegs, DstRegs; SmallVector Indexes; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs); unsigned OpReg = MI.getOperand(2).getReg(); int64_t OpStart = MI.getOperand(3).getImm(); int64_t OpSize = MRI.getType(OpReg).getSizeInBits(); for (int i = 0; i < NumParts; ++i) { unsigned DstStart = i * NarrowSize; if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) { // No part of the insert affects this subregister, forward the original. DstRegs.push_back(SrcRegs[i]); continue; } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) { // The entire subregister is defined by this insert, forward the new // value. DstRegs.push_back(OpReg); continue; } // OpSegStart is where this destination segment would start in OpReg if it // extended infinitely in both directions. 
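// Worked example (illustrative): inserting a 32-bit value at bit 16 of an
// s64 handled as two s32 parts. Part 0 gets InsertOffset=16, ExtractOffset=0,
// SegSize=16, so bits 0-15 of the value land in its top half; part 1 gets
// InsertOffset=0, ExtractOffset=16, SegSize=16, so bits 16-31 of the value
// land in its bottom half.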
int64_t ExtractOffset, InsertOffset, SegSize; if (OpStart < DstStart) { InsertOffset = 0; ExtractOffset = DstStart - OpStart; SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart); } else { InsertOffset = OpStart - DstStart; ExtractOffset = 0; SegSize = std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart); } unsigned SegReg = OpReg; if (ExtractOffset != 0 || SegSize != OpSize) { // A genuine extract is needed. SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize)); MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset); } unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset); DstRegs.push_back(DstReg); } assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered"); MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_LOAD: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; LLT OffsetTy = LLT::scalar( MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits()); SmallVector DstRegs; for (int i = 0; i < NumParts; ++i) { unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); unsigned SrcReg = 0; unsigned Adjustment = i * NarrowSize / 8; MIRBuilder.materializeGEP(SrcReg, MI.getOperand(1).getReg(), OffsetTy, Adjustment); // TODO: This is conservatively correct, but we probably want to split the // memory operands in the future. MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin()); DstRegs.push_back(DstReg); } unsigned DstReg = MI.getOperand(0).getReg(); MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_STORE: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; LLT OffsetTy = LLT::scalar( MRI.getType(MI.getOperand(1).getReg()).getScalarSizeInBits()); SmallVector SrcRegs; extractParts(MI.getOperand(0).getReg(), NarrowTy, NumParts, SrcRegs); for (int i = 0; i < NumParts; ++i) { unsigned DstReg = 0; unsigned Adjustment = i * NarrowSize / 8; MIRBuilder.materializeGEP(DstReg, MI.getOperand(1).getReg(), OffsetTy, Adjustment); // TODO: This is conservatively correct, but we probably want to split the // memory operands in the future. MIRBuilder.buildStore(SrcRegs[i], DstReg, **MI.memoperands_begin()); } MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_CONSTANT: { // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; const APInt &Cst = MI.getOperand(1).getCImm()->getValue(); LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); SmallVector DstRegs; for (int i = 0; i < NumParts; ++i) { unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); ConstantInt *CI = ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize)); MIRBuilder.buildConstant(DstReg, *CI); DstRegs.push_back(DstReg); } unsigned DstReg = MI.getOperand(0).getReg(); MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_OR: { // Legalize bitwise operation: // A = BinOp B, C // into: // B1, ..., BN = G_UNMERGE_VALUES B // C1, ..., CN = G_UNMERGE_VALUES C // A1 = BinOp B1, C2 // ... 
// AN = BinOp BN, CN // A = G_MERGE_VALUES A1, ..., AN // FIXME: add support for when SizeOp0 isn't an exact multiple of // NarrowSize. if (SizeOp0 % NarrowSize != 0) return UnableToLegalize; int NumParts = SizeOp0 / NarrowSize; // List the registers where the destination will be scattered. SmallVector DstRegs; // List the registers where the first argument will be split. SmallVector SrcsReg1; // List the registers where the second argument will be split. SmallVector SrcsReg2; // Create all the temporary registers. for (int i = 0; i < NumParts; ++i) { unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); unsigned SrcReg1 = MRI.createGenericVirtualRegister(NarrowTy); unsigned SrcReg2 = MRI.createGenericVirtualRegister(NarrowTy); DstRegs.push_back(DstReg); SrcsReg1.push_back(SrcReg1); SrcsReg2.push_back(SrcReg2); } // Explode the big arguments into smaller chunks. MIRBuilder.buildUnmerge(SrcsReg1, MI.getOperand(1).getReg()); MIRBuilder.buildUnmerge(SrcsReg2, MI.getOperand(2).getReg()); // Do the operation on each small part. for (int i = 0; i < NumParts; ++i) MIRBuilder.buildOr(DstRegs[i], SrcsReg1[i], SrcsReg2[i]); // Gather the destination registers into the final destination. unsigned DstReg = MI.getOperand(0).getReg(); MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } } } LegalizerHelper::LegalizeResult LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { MIRBuilder.setInstr(MI); switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_ADD: case TargetOpcode::G_AND: case TargetOpcode::G_MUL: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: case TargetOpcode::G_SUB: case TargetOpcode::G_SHL: { // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(1).getReg()); MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(2).getReg()); unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(MI.getOpcode()) .addDef(DstExt) .addUse(Src1Ext) .addUse(Src2Ext); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: case TargetOpcode::G_SREM: case TargetOpcode::G_UREM: case TargetOpcode::G_ASHR: case TargetOpcode::G_LSHR: { unsigned ExtOp = MI.getOpcode() == TargetOpcode::G_SDIV || MI.getOpcode() == TargetOpcode::G_SREM || MI.getOpcode() == TargetOpcode::G_ASHR ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; unsigned LHSExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(LHSExt).addUse( MI.getOperand(1).getReg()); unsigned RHSExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(RHSExt).addUse( MI.getOperand(2).getReg()); unsigned ResExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(MI.getOpcode()) .addDef(ResExt) .addUse(LHSExt) .addUse(RHSExt); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), ResExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_SELECT: { if (TypeIdx != 0) return UnableToLegalize; // Perform operation at larger width (any extension is fine here, high bits // don't affect the result) and then truncate the result back to the // original type. 
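// e.g. widening an s1 G_SELECT to s32 (sketch): both selected values are
// G_ANYEXT'ed to s32, the select runs at s32, and the result is G_TRUNC'ed
// back to s1; the condition operand is passed through unchanged.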
unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy); unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg()); MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg()); unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(TargetOpcode::G_SELECT) .addDef(DstExt) .addReg(MI.getOperand(1).getReg()) .addUse(Src1Ext) .addUse(Src2Ext); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: { if (TypeIdx != 0) return UnableToLegalize; unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(MI.getOpcode()) .addDef(DstExt) .addUse(MI.getOperand(1).getReg()); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: { if (TypeIdx != 1) return UnableToLegalize; unsigned Src = MI.getOperand(1).getReg(); unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); if (MI.getOpcode() == TargetOpcode::G_SITOFP) { MIRBuilder.buildSExt(SrcExt, Src); } else { assert(MI.getOpcode() == TargetOpcode::G_UITOFP && "Unexpected conv op"); MIRBuilder.buildZExt(SrcExt, Src); } MIRBuilder.buildInstr(MI.getOpcode()) .addDef(MI.getOperand(0).getReg()) .addUse(SrcExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_INSERT: { if (TypeIdx != 0) return UnableToLegalize; unsigned Src = MI.getOperand(1).getReg(); unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildAnyExt(SrcExt, Src); unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(), MI.getOperand(3).getImm()); for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) { MIB.addReg(MI.getOperand(OpNum).getReg()); MIB.addImm(MI.getOperand(OpNum + 1).getImm()); } MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_LOAD: { assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) == WideTy.getSizeInBits() && "illegal to increase number of bytes loaded"); unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildLoad(DstExt, MI.getOperand(1).getReg(), **MI.memoperands_begin()); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_STORE: { if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) || WideTy != LLT::scalar(8)) return UnableToLegalize; auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering(); auto Content = TLI.getBooleanContents(false, false); unsigned ExtOp = TargetOpcode::G_ANYEXT; if (Content == TargetLoweringBase::ZeroOrOneBooleanContent) ExtOp = TargetOpcode::G_ZEXT; else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent) ExtOp = TargetOpcode::G_SEXT; else ExtOp = TargetOpcode::G_ANYEXT; unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse( MI.getOperand(0).getReg()); MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(), **MI.memoperands_begin()); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_CONSTANT: { unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildConstant(DstExt, *MI.getOperand(1).getCImm()); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case 
TargetOpcode::G_FCONSTANT: { unsigned DstExt = MRI.createGenericVirtualRegister(WideTy); - MIRBuilder.buildFConstant(DstExt, *MI.getOperand(1).getFPImm()); + const ConstantFP *CFP = MI.getOperand(1).getFPImm(); + APFloat Val = CFP->getValueAPF(); + LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); + auto LLT2Sem = [](LLT Ty) { + switch (Ty.getSizeInBits()) { + case 32: + return &APFloat::IEEEsingle(); + break; + case 64: + return &APFloat::IEEEdouble(); + break; + default: + llvm_unreachable("Unhandled fp widen type"); + } + }; + bool LosesInfo; + Val.convert(*LLT2Sem(WideTy), APFloat::rmTowardZero, &LosesInfo); + MIRBuilder.buildFConstant(DstExt, *ConstantFP::get(Ctx, Val)); MIRBuilder.buildFPTrunc(MI.getOperand(0).getReg(), DstExt); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_BRCOND: { unsigned TstExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildAnyExt(TstExt, MI.getOperand(0).getReg()); MIRBuilder.buildBrCond(TstExt, *MI.getOperand(1).getMBB()); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FCMP: { unsigned Op0Ext, Op1Ext, DstReg; unsigned Cmp1 = MI.getOperand(2).getReg(); unsigned Cmp2 = MI.getOperand(3).getReg(); if (TypeIdx == 0) { Op0Ext = Cmp1; Op1Ext = Cmp2; DstReg = MRI.createGenericVirtualRegister(WideTy); } else { Op0Ext = MRI.createGenericVirtualRegister(WideTy); Op1Ext = MRI.createGenericVirtualRegister(WideTy); DstReg = MI.getOperand(0).getReg(); MIRBuilder.buildInstr(TargetOpcode::G_FPEXT, Op0Ext, Cmp1); MIRBuilder.buildInstr(TargetOpcode::G_FPEXT, Op1Ext, Cmp2); } MIRBuilder.buildFCmp( static_cast(MI.getOperand(1).getPredicate()), DstReg, Op0Ext, Op1Ext); if (TypeIdx == 0) MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, MI.getOperand(0).getReg(), DstReg); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_ICMP: { bool IsSigned = CmpInst::isSigned( static_cast(MI.getOperand(1).getPredicate())); unsigned Cmp1 = MI.getOperand(2).getReg(); unsigned Cmp2 = MI.getOperand(3).getReg(); unsigned Op0Ext, Op1Ext, DstReg; if (TypeIdx == 0) { Op0Ext = Cmp1; Op1Ext = Cmp2; DstReg = MRI.createGenericVirtualRegister(WideTy); } else { Op0Ext = MRI.createGenericVirtualRegister(WideTy); Op1Ext = MRI.createGenericVirtualRegister(WideTy); DstReg = MI.getOperand(0).getReg(); if (IsSigned) { MIRBuilder.buildSExt(Op0Ext, Cmp1); MIRBuilder.buildSExt(Op1Ext, Cmp2); } else { MIRBuilder.buildZExt(Op0Ext, Cmp1); MIRBuilder.buildZExt(Op1Ext, Cmp2); } } MIRBuilder.buildICmp( static_cast(MI.getOperand(1).getPredicate()), DstReg, Op0Ext, Op1Ext); if (TypeIdx == 0) MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, MI.getOperand(0).getReg(), DstReg); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_GEP: { assert(TypeIdx == 1 && "unable to legalize pointer of GEP"); unsigned OffsetExt = MRI.createGenericVirtualRegister(WideTy); MIRBuilder.buildSExt(OffsetExt, MI.getOperand(2).getReg()); MI.getOperand(2).setReg(OffsetExt); return Legalized; } case TargetOpcode::G_PHI: { assert(TypeIdx == 0 && "Expecting only Idx 0"); auto getExtendedReg = [&](unsigned Reg, MachineBasicBlock &MBB) { auto FirstTermIt = MBB.getFirstTerminator(); MIRBuilder.setInsertPt(MBB, FirstTermIt); MachineInstr *DefMI = MRI.getVRegDef(Reg); MachineInstrBuilder MIB; if (DefMI->getOpcode() == TargetOpcode::G_TRUNC) MIB = MIRBuilder.buildAnyExtOrTrunc(WideTy, DefMI->getOperand(1).getReg()); else MIB = MIRBuilder.buildAnyExt(WideTy, Reg); return MIB->getOperand(0).getReg(); }; auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_PHI, WideTy); for (auto OpIt = 
MI.operands_begin() + 1, OpE = MI.operands_end(); OpIt != OpE;) { unsigned Reg = OpIt++->getReg(); MachineBasicBlock *OpMBB = OpIt++->getMBB(); MIB.addReg(getExtendedReg(Reg, *OpMBB)); MIB.addMBB(OpMBB); } auto *MBB = MI.getParent(); MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI()); MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), MIB->getOperand(0).getReg()); MI.eraseFromParent(); return Legalized; } } } LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; MIRBuilder.setInstr(MI); switch(MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_SREM: case TargetOpcode::G_UREM: { unsigned QuotReg = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV) .addDef(QuotReg) .addUse(MI.getOperand(1).getReg()) .addUse(MI.getOperand(2).getReg()); unsigned ProdReg = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildMul(ProdReg, QuotReg, MI.getOperand(2).getReg()); MIRBuilder.buildSub(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), ProdReg); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_SMULO: case TargetOpcode::G_UMULO: { // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the // result. unsigned Res = MI.getOperand(0).getReg(); unsigned Overflow = MI.getOperand(1).getReg(); unsigned LHS = MI.getOperand(2).getReg(); unsigned RHS = MI.getOperand(3).getReg(); MIRBuilder.buildMul(Res, LHS, RHS); unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO ? TargetOpcode::G_SMULH : TargetOpcode::G_UMULH; unsigned HiPart = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildInstr(Opcode) .addDef(HiPart) .addUse(LHS) .addUse(RHS); unsigned Zero = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildConstant(Zero, 0); // For *signed* multiply, overflow is detected by checking: // (hi != (lo >> bitwidth-1)) if (Opcode == TargetOpcode::G_SMULH) { unsigned Shifted = MRI.createGenericVirtualRegister(Ty); unsigned ShiftAmt = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildConstant(ShiftAmt, Ty.getSizeInBits() - 1); MIRBuilder.buildInstr(TargetOpcode::G_ASHR) .addDef(Shifted) .addUse(Res) .addUse(ShiftAmt); MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted); } else { MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero); } MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FNEG: { // TODO: Handle vector types once we are able to // represent them. if (Ty.isVector()) return UnableToLegalize; unsigned Res = MI.getOperand(0).getReg(); Type *ZeroTy; LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); switch (Ty.getSizeInBits()) { case 16: ZeroTy = Type::getHalfTy(Ctx); break; case 32: ZeroTy = Type::getFloatTy(Ctx); break; case 64: ZeroTy = Type::getDoubleTy(Ctx); break; case 128: ZeroTy = Type::getFP128Ty(Ctx); break; default: llvm_unreachable("unexpected floating-point type"); } ConstantFP &ZeroForNegation = *cast(ConstantFP::getZeroValueForNegation(ZeroTy)); unsigned Zero = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildFConstant(Zero, ZeroForNegation); MIRBuilder.buildInstr(TargetOpcode::G_FSUB) .addDef(Res) .addUse(Zero) .addUse(MI.getOperand(1).getReg()); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_FSUB: { // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)). // First, check if G_FNEG is marked as Lower. If so, we may // end up with an infinite loop as G_FSUB is used to legalize G_FNEG. 
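// (G_FNEG above is itself lowered as G_FSUB -0.0, X, so if both opcodes were
// marked Lower the two expansions would keep re-introducing each other;
// bailing out here breaks that cycle.)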
if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower) return UnableToLegalize; unsigned Res = MI.getOperand(0).getReg(); unsigned LHS = MI.getOperand(1).getReg(); unsigned RHS = MI.getOperand(2).getReg(); unsigned Neg = MRI.createGenericVirtualRegister(Ty); MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS); MIRBuilder.buildInstr(TargetOpcode::G_FADD) .addDef(Res) .addUse(LHS) .addUse(Neg); MI.eraseFromParent(); return Legalized; } case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: { unsigned OldValRes = MI.getOperand(0).getReg(); unsigned SuccessRes = MI.getOperand(1).getReg(); unsigned Addr = MI.getOperand(2).getReg(); unsigned CmpVal = MI.getOperand(3).getReg(); unsigned NewVal = MI.getOperand(4).getReg(); MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal, **MI.memoperands_begin()); MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal); MI.eraseFromParent(); return Legalized; } } } LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { // FIXME: Don't know how to handle secondary types yet. if (TypeIdx != 0) return UnableToLegalize; switch (MI.getOpcode()) { default: return UnableToLegalize; case TargetOpcode::G_ADD: { unsigned NarrowSize = NarrowTy.getSizeInBits(); unsigned DstReg = MI.getOperand(0).getReg(); unsigned Size = MRI.getType(DstReg).getSizeInBits(); int NumParts = Size / NarrowSize; // FIXME: Don't know how to handle the situation where the small vectors // aren't all the same size yet. if (Size % NarrowSize != 0) return UnableToLegalize; MIRBuilder.setInstr(MI); SmallVector Src1Regs, Src2Regs, DstRegs; extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs); extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs); for (int i = 0; i < NumParts; ++i) { unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy); MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]); DstRegs.push_back(DstReg); } MIRBuilder.buildMerge(DstReg, DstRegs); MI.eraseFromParent(); return Legalized; } } } Index: head/contrib/llvm/lib/CodeGen/RegAllocFast.cpp =================================================================== --- head/contrib/llvm/lib/CodeGen/RegAllocFast.cpp (revision 328752) +++ head/contrib/llvm/lib/CodeGen/RegAllocFast.cpp (revision 328753) @@ -1,1117 +1,1118 @@ //===- RegAllocFast.cpp - A fast register allocator for debug code --------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file This register allocator allocates registers to a basic block at a /// time, attempting to keep values in registers and reusing registers as /// appropriate. 
// //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Metadata.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <tuple> #include <vector> using namespace llvm; #define DEBUG_TYPE "regalloc" STATISTIC(NumStores, "Number of stores added"); STATISTIC(NumLoads , "Number of loads added"); STATISTIC(NumCopies, "Number of copies coalesced"); static RegisterRegAlloc fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator); namespace { class RegAllocFast : public MachineFunctionPass { public: static char ID; RegAllocFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1) {} private: MachineFrameInfo *MFI; MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; RegisterClassInfo RegClassInfo; /// Basic block currently being allocated. MachineBasicBlock *MBB; /// Maps virtual regs to the frame index where these values are spilled. IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg; /// Everything we know about a live virtual register. struct LiveReg { MachineInstr *LastUse = nullptr; ///< Last instr to use reg. unsigned VirtReg; ///< Virtual register number. MCPhysReg PhysReg = 0; ///< Currently held here. unsigned short LastOpNum = 0; ///< OpNum on LastUse. bool Dirty = false; ///< Register needs spill. explicit LiveReg(unsigned v) : VirtReg(v) {} unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); } }; using LiveRegMap = SparseSet<LiveReg>; /// This map contains entries for each virtual register that is currently /// available in a physical register. LiveRegMap LiveVirtRegs; DenseMap<unsigned, SmallVector<MachineInstr *, 4>> LiveDbgValueMap; /// Track the state of a physical register. enum RegState { /// A disabled register is not available for allocation, but an alias may /// be in use. A register can only be moved out of the disabled state if /// all aliases are disabled. regDisabled, /// A free register is not currently in use and can be allocated /// immediately without checking aliases. regFree, /// A reserved register has been assigned explicitly (e.g., setting up a /// call parameter), and it remains reserved until it is used. regReserved /// A register state may also be a virtual register number, indicating /// that the physical register is currently allocated to a virtual /// register. In that case, LiveVirtRegs contains the inverse mapping. }; /// One of the RegState enums, or a virtreg.
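// e.g. PhysRegState[R] == regFree means R can be handed out immediately,
// while a value outside the enum range is the number of the virtual register
// currently held in R, with LiveVirtRegs providing the inverse mapping.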
    /// One of the RegState enums, or a virtreg.
    std::vector<unsigned> PhysRegState;

    SmallVector<unsigned, 16> VirtDead;
    SmallVector<MachineInstr *, 32> Coalesced;

    /// Set of register units.
    using UsedInInstrSet = SparseSet<uint16_t>;

    /// Set of register units that are used in the current instruction, and so
    /// cannot be allocated.
    UsedInInstrSet UsedInInstr;

    /// Mark a physreg as used in this instruction.
    void markRegUsedInInstr(MCPhysReg PhysReg) {
      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
        UsedInInstr.insert(*Units);
    }

    /// Check if a physreg or any of its aliases are used in this instruction.
    bool isRegUsedInInstr(MCPhysReg PhysReg) const {
      for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
        if (UsedInInstr.count(*Units))
          return true;
      return false;
    }

    /// This flag is set when LiveRegMap will be cleared completely after
    /// spilling all live registers. LiveRegMap entries should not be erased.
    bool isBulkSpilling = false;

    enum : unsigned {
      spillClean = 1,
      spillDirty = 100,
      spillImpossible = ~0u
    };

  public:
    StringRef getPassName() const override { return "Fast Register Allocator"; }

    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    MachineFunctionProperties getRequiredProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::NoPHIs);
    }

    MachineFunctionProperties getSetProperties() const override {
      return MachineFunctionProperties().set(
          MachineFunctionProperties::Property::NoVRegs);
    }

  private:
    bool runOnMachineFunction(MachineFunction &Fn) override;
    void allocateBasicBlock(MachineBasicBlock &MBB);
    void handleThroughOperands(MachineInstr &MI,
                               SmallVectorImpl<unsigned> &VirtDead);
    int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass &RC);
    bool isLastUseOfLocalReg(const MachineOperand &MO) const;

    void addKillFlag(const LiveReg &LRI);
    void killVirtReg(LiveRegMap::iterator LRI);
    void killVirtReg(unsigned VirtReg);
    void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
    void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);

    void usePhysReg(MachineOperand &MO);
-   void definePhysReg(MachineInstr &MI, MCPhysReg PhysReg, RegState NewState);
+   void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg,
+                      RegState NewState);
    unsigned calcSpillCost(MCPhysReg PhysReg) const;
-   void assignVirtToPhysReg(LiveReg&, MCPhysReg PhysReg);
+   void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);

    LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
      return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
    }

    LiveRegMap::const_iterator findLiveVirtReg(unsigned VirtReg) const {
      return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
    }

    LiveRegMap::iterator assignVirtToPhysReg(unsigned VReg, MCPhysReg PhysReg);
    LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
                                      unsigned Hint);
    LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
                                       unsigned VirtReg, unsigned Hint);
    LiveRegMap::iterator reloadVirtReg(MachineInstr &MI, unsigned OpNum,
                                       unsigned VirtReg, unsigned Hint);
    void spillAll(MachineBasicBlock::iterator MI);
    bool setPhysReg(MachineInstr &MI, unsigned OpNum, MCPhysReg PhysReg);

    void dumpState();
  };

} // end anonymous namespace

char RegAllocFast::ID = 0;

INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
                false)
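//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// Shape of getStackSpaceFor() below: a spill slot is created on first use and
// memoized, with -1 as the "no slot yet" sentinel. Standalone approximation;
// makeSlot stands in for MFI->CreateSpillStackObject().
#include <vector>

int lazySlot(std::vector<int> &SlotForVirtReg, unsigned VirtReg,
             int (*makeSlot)()) {
  int &SS = SlotForVirtReg[VirtReg];
  if (SS == -1)       // First spill of this vreg: allocate a frame index.
    SS = makeSlot();
  return SS;          // Every later spill/reload reuses the same slot.
}
//===----------------------------------------------------------------------===//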
/// This allocates space for the specified virtual register to be held on the
/// stack.
int RegAllocFast::getStackSpaceFor(unsigned VirtReg,
                                   const TargetRegisterClass &RC) {
  // Find the location Reg would belong...
  int SS = StackSlotForVirtReg[VirtReg];
  // Already has space allocated?
  if (SS != -1)
    return SS;

  // Allocate a new stack object for this spill location...
  unsigned Size = TRI->getSpillSize(RC);
  unsigned Align = TRI->getSpillAlignment(RC);
  int FrameIdx = MFI->CreateSpillStackObject(Size, Align);

  // Assign the slot.
  StackSlotForVirtReg[VirtReg] = FrameIdx;
  return FrameIdx;
}

/// Return true if MO is the only remaining reference to its virtual register,
/// and it is guaranteed to be a block-local register.
bool RegAllocFast::isLastUseOfLocalReg(const MachineOperand &MO) const {
  // If the register has ever been spilled or reloaded, we conservatively
  // assume it is a global register used in multiple blocks.
  if (StackSlotForVirtReg[MO.getReg()] != -1)
    return false;

  // Check that the use/def chain has exactly one operand - MO.
  MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(MO.getReg());
  if (&*I != &MO)
    return false;
  return ++I == MRI->reg_nodbg_end();
}

/// Set kill flags on last use of a virtual register.
void RegAllocFast::addKillFlag(const LiveReg &LR) {
  if (!LR.LastUse) return;
  MachineOperand &MO = LR.LastUse->getOperand(LR.LastOpNum);
  if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) {
    if (MO.getReg() == LR.PhysReg)
      MO.setIsKill();
    // else, don't do anything; we are probably redefining a
    // subreg of this register and given we don't track which
    // lanes are actually dead, we cannot insert a kill flag here.
    // Otherwise we may end up in a situation like this:
    // ... = (MO) physreg:sub1, implicit killed physreg
    // ... <== Here we would allow later pass to reuse physreg:sub1
    //         which is potentially wrong.
    // LR:sub0 = ...
    // ... = LR.sub1 <== This is going to use physreg:sub1
  }
}

/// Mark virtreg as no longer available.
void RegAllocFast::killVirtReg(LiveRegMap::iterator LRI) {
  addKillFlag(*LRI);
  assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
         "Broken RegState mapping");
  PhysRegState[LRI->PhysReg] = regFree;
  // Erase from LiveVirtRegs unless we're spilling in bulk.
  if (!isBulkSpilling)
    LiveVirtRegs.erase(LRI);
}

/// Mark virtreg as no longer available.
void RegAllocFast::killVirtReg(unsigned VirtReg) {
  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
         "killVirtReg needs a virtual register");
  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
  if (LRI != LiveVirtRegs.end())
    killVirtReg(LRI);
}

/// This method spills the value specified by VirtReg into the corresponding
/// stack slot if needed.
void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
                                unsigned VirtReg) {
  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
         "Spilling a physical register is illegal!");
  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
  assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
  spillVirtReg(MI, LRI);
}

/// Do the actual work of spilling.
void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
                                LiveRegMap::iterator LRI) {
  LiveReg &LR = *LRI;
  assert(PhysRegState[LR.PhysReg] == LRI->VirtReg && "Broken RegState mapping");

  if (LR.Dirty) {
    // If this physreg is used by the instruction, we want to kill it on the
    // instruction, not on the spill.
    bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
    LR.Dirty = false;
    DEBUG(dbgs() << "Spilling " << printReg(LRI->VirtReg, TRI)
                 << " in " << printReg(LR.PhysReg, TRI));
    const TargetRegisterClass &RC = *MRI->getRegClass(LRI->VirtReg);
    int FI = getStackSpaceFor(LRI->VirtReg, RC);
    DEBUG(dbgs() << " to stack slot #" << FI << "\n");
    TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, &RC, TRI);
    ++NumStores;   // Update statistics

    // If this register is used by DBG_VALUE then insert new DBG_VALUE to
    // identify spilled location as the place to find corresponding variable's
    // value.
    SmallVectorImpl<MachineInstr *> &LRIDbgValues =
        LiveDbgValueMap[LRI->VirtReg];
    for (MachineInstr *DBG : LRIDbgValues) {
      MachineInstr *NewDV = buildDbgValueForSpill(*MBB, MI, *DBG, FI);
      assert(NewDV->getParent() == MBB && "dangling parent pointer");
      (void)NewDV;
      DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
    }
    // Now that this register is spilled there should not be any DBG_VALUE
    // pointing to this register because they are all pointing to the spilled
    // value now.
    LRIDbgValues.clear();
    if (SpillKill)
      LR.LastUse = nullptr; // Don't kill register again
  }
  killVirtReg(LRI);
}

/// Spill all dirty virtregs without killing them.
void RegAllocFast::spillAll(MachineBasicBlock::iterator MI) {
  if (LiveVirtRegs.empty()) return;
  isBulkSpilling = true;
  // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
  // of spilling here is deterministic, if arbitrary.
  for (LiveRegMap::iterator I = LiveVirtRegs.begin(), E = LiveVirtRegs.end();
       I != E; ++I)
    spillVirtReg(MI, I);
  LiveVirtRegs.clear();
  isBulkSpilling = false;
}

/// Handle the direct use of a physical register.  Check that the register is
/// not used by a virtreg. Kill the physreg, marking it free. This may add
/// implicit kills to MO->getParent() and invalidate MO.
void RegAllocFast::usePhysReg(MachineOperand &MO) {
  // Ignore undef uses.
  if (MO.isUndef())
    return;

  unsigned PhysReg = MO.getReg();
  assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
         "Bad usePhysReg operand");

  markRegUsedInInstr(PhysReg);
  switch (PhysRegState[PhysReg]) {
  case regDisabled:
    break;
  case regReserved:
    PhysRegState[PhysReg] = regFree;
    LLVM_FALLTHROUGH;
  case regFree:
    MO.setIsKill();
    return;
  default:
    // The physreg was allocated to a virtual register. That means the value we
    // wanted has been clobbered.
    llvm_unreachable("Instruction uses an allocated register");
  }

  // Maybe a superregister is reserved?
  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
    MCPhysReg Alias = *AI;
    switch (PhysRegState[Alias]) {
    case regDisabled:
      break;
    case regReserved:
      // Either PhysReg is a subregister of Alias and we mark the
      // whole register as free, or PhysReg is the superregister of
      // Alias and we mark all the aliases as disabled before freeing
      // PhysReg.
      // In the latter case, since PhysReg was disabled, this means that
      // its value is defined only by physical sub-registers. This check
      // is performed by the assert of the default case in this loop.
      // Note: The value of the superregister may only be partially
      // defined, that is why regDisabled is a valid state for aliases.
      assert((TRI->isSuperRegister(PhysReg, Alias) ||
              TRI->isSuperRegister(Alias, PhysReg)) &&
             "Instruction is not using a subregister of a reserved register");
      LLVM_FALLTHROUGH;
    case regFree:
      if (TRI->isSuperRegister(PhysReg, Alias)) {
        // Leave the superregister in the working set.
        PhysRegState[Alias] = regFree;
        MO.getParent()->addRegisterKilled(Alias, TRI, true);
        return;
      }
      // Some other alias was in the working set - clear it.
      PhysRegState[Alias] = regDisabled;
      break;
    default:
      llvm_unreachable("Instruction uses an alias of an allocated register");
    }
  }

  // All aliases are disabled, bring register into working set.
  PhysRegState[PhysReg] = regFree;
  MO.setIsKill();
}

/// Mark PhysReg as reserved or free after spilling any virtregs. This is very
/// similar to defineVirtReg except the physreg is reserved instead of
/// allocated.
-void RegAllocFast::definePhysReg(MachineInstr &MI, MCPhysReg PhysReg,
-                                 RegState NewState) {
+void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
+                                 MCPhysReg PhysReg, RegState NewState) {
  markRegUsedInInstr(PhysReg);
  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
  case regDisabled:
    break;
  default:
    spillVirtReg(MI, VirtReg);
    LLVM_FALLTHROUGH;
  case regFree:
  case regReserved:
    PhysRegState[PhysReg] = NewState;
    return;
  }

  // This is a disabled register, disable all aliases.
  PhysRegState[PhysReg] = NewState;
  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
    MCPhysReg Alias = *AI;
    switch (unsigned VirtReg = PhysRegState[Alias]) {
    case regDisabled:
      break;
    default:
      spillVirtReg(MI, VirtReg);
      LLVM_FALLTHROUGH;
    case regFree:
    case regReserved:
      PhysRegState[Alias] = regDisabled;
      if (TRI->isSuperRegister(PhysReg, Alias))
        return;
      break;
    }
  }
}

/// \brief Return the cost of spilling/clearing out PhysReg and aliases so it
/// is free for allocation. Returns 0 when PhysReg is free or disabled with all
/// aliases disabled - it can be allocated directly.
/// \returns spillImpossible when PhysReg or an alias can't be spilled.
unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
  if (isRegUsedInInstr(PhysReg)) {
    DEBUG(dbgs() << printReg(PhysReg, TRI) << " is already used in instr.\n");
    return spillImpossible;
  }
  switch (unsigned VirtReg = PhysRegState[PhysReg]) {
  case regDisabled:
    break;
  case regFree:
    return 0;
  case regReserved:
    DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding "
                 << printReg(PhysReg, TRI) << " is reserved already.\n");
    return spillImpossible;
  default: {
    LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
    assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
    return I->Dirty ? spillDirty : spillClean;
  }
  }

  // This is a disabled register, add up cost of aliases.
  DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n");
  unsigned Cost = 0;
  for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
    MCPhysReg Alias = *AI;
    switch (unsigned VirtReg = PhysRegState[Alias]) {
    case regDisabled:
      break;
    case regFree:
      ++Cost;
      break;
    case regReserved:
      return spillImpossible;
    default: {
      LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
      assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
      Cost += I->Dirty ? spillDirty : spillClean;
      break;
    }
    }
  }
  return Cost;
}
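//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// calcSpillCost() above prices a candidate physreg for allocVirtReg() below:
// each clean live vreg displaced costs spillClean (1), each dirty one
// spillDirty (100), and reserved or already-used registers are
// spillImpossible. A standalone scorer over precomputed costs (hypothetical
// helper) showing the "take a free register immediately, else cheapest" rule:
unsigned pickCheapest(const unsigned Costs[], unsigned N, unsigned Impossible) {
  unsigned Best = Impossible, BestIdx = N;
  for (unsigned I = 0; I != N; ++I) {
    if (Costs[I] == 0)
      return I;                 // Free register: take it immediately.
    if (Costs[I] < Best) {
      Best = Costs[I];
      BestIdx = I;
    }
  }
  return BestIdx;               // == N when every candidate is impossible.
}
//===----------------------------------------------------------------------===//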
/// \brief This method updates local state so that we know that PhysReg is the
/// proper container for VirtReg now. The physical register must not be used
/// for anything else when this is called.
void RegAllocFast::assignVirtToPhysReg(LiveReg &LR, MCPhysReg PhysReg) {
  DEBUG(dbgs() << "Assigning " << printReg(LR.VirtReg, TRI) << " to "
               << printReg(PhysReg, TRI) << "\n");
  PhysRegState[PhysReg] = LR.VirtReg;
  assert(!LR.PhysReg && "Already assigned a physreg");
  LR.PhysReg = PhysReg;
}

RegAllocFast::LiveRegMap::iterator
RegAllocFast::assignVirtToPhysReg(unsigned VirtReg, MCPhysReg PhysReg) {
  LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
  assert(LRI != LiveVirtRegs.end() && "VirtReg disappeared");
  assignVirtToPhysReg(*LRI, PhysReg);
  return LRI;
}

/// Allocates a physical register for VirtReg.
RegAllocFast::LiveRegMap::iterator
RegAllocFast::allocVirtReg(MachineInstr &MI, LiveRegMap::iterator LRI,
                           unsigned Hint) {
  const unsigned VirtReg = LRI->VirtReg;

  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
         "Can only allocate virtual registers");

  // Take hint when possible.
  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
  if (TargetRegisterInfo::isPhysicalRegister(Hint) &&
      MRI->isAllocatable(Hint) && RC.contains(Hint)) {
    // Ignore the hint if we would have to spill a dirty register.
    unsigned Cost = calcSpillCost(Hint);
    if (Cost < spillDirty) {
      if (Cost)
        definePhysReg(MI, Hint, regFree);
      // definePhysReg may kill virtual registers and modify LiveVirtRegs.
      // That invalidates LRI, so run a new lookup for VirtReg.
      return assignVirtToPhysReg(VirtReg, Hint);
    }
  }

  // First try to find a completely free register.
  ArrayRef<MCPhysReg> AO = RegClassInfo.getOrder(&RC);
  for (MCPhysReg PhysReg : AO) {
    if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
      assignVirtToPhysReg(*LRI, PhysReg);
      return LRI;
    }
  }

  DEBUG(dbgs() << "Allocating " << printReg(VirtReg) << " from "
               << TRI->getRegClassName(&RC) << "\n");

  unsigned BestReg = 0;
  unsigned BestCost = spillImpossible;
  for (MCPhysReg PhysReg : AO) {
    unsigned Cost = calcSpillCost(PhysReg);
    DEBUG(dbgs() << "\tRegister: " << printReg(PhysReg, TRI) << "\n");
    DEBUG(dbgs() << "\tCost: " << Cost << "\n");
    DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
    // Cost is 0 when all aliases are already disabled.
    if (Cost == 0) {
      assignVirtToPhysReg(*LRI, PhysReg);
      return LRI;
    }
    if (Cost < BestCost)
      BestReg = PhysReg, BestCost = Cost;
  }

  if (BestReg) {
    definePhysReg(MI, BestReg, regFree);
    // definePhysReg may kill virtual registers and modify LiveVirtRegs.
    // That invalidates LRI, so run a new lookup for VirtReg.
    return assignVirtToPhysReg(VirtReg, BestReg);
  }

  // Nothing we can do. Report an error and keep going with a bad allocation.
  if (MI.isInlineAsm())
    MI.emitError("inline assembly requires more registers than available");
  else
    MI.emitError("ran out of registers during register allocation");
  definePhysReg(MI, *AO.begin(), regFree);
  return assignVirtToPhysReg(VirtReg, *AO.begin());
}

/// Allocates a register for VirtReg and marks it as dirty.
RegAllocFast::LiveRegMap::iterator
RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
                            unsigned VirtReg, unsigned Hint) {
  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
         "Not a virtual register");
  LiveRegMap::iterator LRI;
  bool New;
  std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
  if (New) {
    // If there is no hint, peek at the only use of this register.
    if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
        MRI->hasOneNonDBGUse(VirtReg)) {
      const MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(VirtReg);
      // It's a copy, use the destination register as a hint.
      if (UseMI.isCopyLike())
        Hint = UseMI.getOperand(0).getReg();
    }
    LRI = allocVirtReg(MI, LRI, Hint);
  } else if (LRI->LastUse) {
    // Redefining a live register - kill at the last use, unless it is this
    // instruction defining VirtReg multiple times.
    if (LRI->LastUse != &MI || LRI->LastUse->getOperand(LRI->LastOpNum).isUse())
      addKillFlag(*LRI);
  }
  assert(LRI->PhysReg && "Register not assigned");
  LRI->LastUse = &MI;
  LRI->LastOpNum = OpNum;
  LRI->Dirty = true;
  markRegUsedInInstr(LRI->PhysReg);
  return LRI;
}

/// Make sure VirtReg is available in a physreg and return it.
RegAllocFast::LiveRegMap::iterator
RegAllocFast::reloadVirtReg(MachineInstr &MI, unsigned OpNum,
                            unsigned VirtReg, unsigned Hint) {
  assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
         "Not a virtual register");
  LiveRegMap::iterator LRI;
  bool New;
  std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
  MachineOperand &MO = MI.getOperand(OpNum);
  if (New) {
    LRI = allocVirtReg(MI, LRI, Hint);
    const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
    int FrameIndex = getStackSpaceFor(VirtReg, RC);
    DEBUG(dbgs() << "Reloading " << printReg(VirtReg, TRI) << " into "
                 << printReg(LRI->PhysReg, TRI) << "\n");
    TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, &RC, TRI);
    ++NumLoads;
  } else if (LRI->Dirty) {
    if (isLastUseOfLocalReg(MO)) {
      DEBUG(dbgs() << "Killing last use: " << MO << "\n");
      if (MO.isUse())
        MO.setIsKill();
      else
        MO.setIsDead();
    } else if (MO.isKill()) {
      DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
      MO.setIsKill(false);
    } else if (MO.isDead()) {
      DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
      MO.setIsDead(false);
    }
  } else if (MO.isKill()) {
    // We must remove kill flags from uses of reloaded registers because the
    // register would be killed immediately, and there might be a second use:
    //   %foo = OR killed %x, %x
    // This would cause a second reload of %x into a different register.
    DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
    MO.setIsKill(false);
  } else if (MO.isDead()) {
    DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
    MO.setIsDead(false);
  }
  assert(LRI->PhysReg && "Register not assigned");
  LRI->LastUse = &MI;
  LRI->LastOpNum = OpNum;
  markRegUsedInInstr(LRI->PhysReg);
  return LRI;
}

/// Changes operand OpNum in MI to refer to PhysReg, considering subregs. This
/// may invalidate any operand pointers. Return true if the operand kills its
/// register.
bool RegAllocFast::setPhysReg(MachineInstr &MI, unsigned OpNum,
                              MCPhysReg PhysReg) {
  MachineOperand &MO = MI.getOperand(OpNum);
  bool Dead = MO.isDead();
  if (!MO.getSubReg()) {
    MO.setReg(PhysReg);
    MO.setIsRenamableIfNoExtraRegAllocReq();
    return MO.isKill() || Dead;
  }

  // Handle subregister index.
  MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
  MO.setIsRenamableIfNoExtraRegAllocReq();
  MO.setSubReg(0);

  // A kill flag implies killing the full register. Add corresponding super
  // register kill.
  if (MO.isKill()) {
    MI.addRegisterKilled(PhysReg, TRI, true);
    return true;
  }

  // A <def,read-undef> of a sub-register requires an implicit def of the full
  // register.
  if (MO.isDef() && MO.isUndef())
    MI.addRegisterDefined(PhysReg, TRI);

  return Dead;
}
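//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// The operand rewrite in setPhysReg() above, reduced to its core: with a
// subregister index the operand collapses to the concrete physical subreg,
// and a null PhysReg stays null. Hypothetical standalone helper using the
// TargetRegisterInfo API already included in this file:
static MCPhysReg resolveOperandReg(const TargetRegisterInfo &TRI,
                                   MCPhysReg PhysReg, unsigned SubIdx) {
  return (PhysReg && SubIdx) ? TRI.getSubReg(PhysReg, SubIdx) : PhysReg;
}
//===----------------------------------------------------------------------===//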
// Handles special instruction operands like early clobbers and tied ops when
// there are additional physreg defines.
void RegAllocFast::handleThroughOperands(MachineInstr &MI,
                                         SmallVectorImpl<unsigned> &VirtDead) {
  DEBUG(dbgs() << "Scanning for through registers:");
  SmallSet<unsigned, 8> ThroughRegs;
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg()) continue;
    unsigned Reg = MO.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg))
      continue;
    if (MO.isEarlyClobber() || (MO.isUse() && MO.isTied()) ||
        (MO.getSubReg() && MI.readsVirtualRegister(Reg))) {
      if (ThroughRegs.insert(Reg).second)
        DEBUG(dbgs() << ' ' << printReg(Reg));
    }
  }

  // If any physreg defines collide with preallocated through registers,
  // we must spill and reallocate.
  DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || !MO.isDef()) continue;
    unsigned Reg = MO.getReg();
    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
    markRegUsedInInstr(Reg);
    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
      if (ThroughRegs.count(PhysRegState[*AI]))
        definePhysReg(MI, *AI, regFree);
    }
  }

  SmallVector<MCPhysReg, 8> PartialDefs;
  DEBUG(dbgs() << "Allocating tied uses.\n");
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (!MO.isReg()) continue;
    unsigned Reg = MO.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
    if (MO.isUse()) {
      if (!MO.isTied()) continue;
      DEBUG(dbgs() << "Operand " << I << "(" << MO << ") is tied to operand "
                   << MI.findTiedOperandIdx(I) << ".\n");
      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
      MCPhysReg PhysReg = LRI->PhysReg;
      setPhysReg(MI, I, PhysReg);
      // Note: we don't update the def operand yet. That would cause the normal
      // def-scan to attempt spilling.
    } else if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) {
      DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
      // Reload the register, but don't assign to the operand just yet.
      // That would confuse the later phys-def processing pass.
      LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, 0);
      PartialDefs.push_back(LRI->PhysReg);
    }
  }

  DEBUG(dbgs() << "Allocating early clobbers.\n");
  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI.getOperand(I);
    if (!MO.isReg()) continue;
    unsigned Reg = MO.getReg();
    if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
    if (!MO.isEarlyClobber())
      continue;
    // Note: defineVirtReg may invalidate MO.
    LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, 0);
    MCPhysReg PhysReg = LRI->PhysReg;
    if (setPhysReg(MI, I, PhysReg))
      VirtDead.push_back(Reg);
  }

  // Restore UsedInInstr to a state usable for allocating normal virtual uses.
  UsedInInstr.clear();
  for (const MachineOperand &MO : MI.operands()) {
    if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
    unsigned Reg = MO.getReg();
    if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
    DEBUG(dbgs() << "\tSetting " << printReg(Reg, TRI)
                 << " as used in instr\n");
    markRegUsedInInstr(Reg);
  }

  // Also mark PartialDefs as used to avoid reallocation.
for (unsigned PartialDef : PartialDefs) markRegUsedInInstr(PartialDef); } #ifndef NDEBUG void RegAllocFast::dumpState() { for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) { if (PhysRegState[Reg] == regDisabled) continue; dbgs() << " " << printReg(Reg, TRI); switch(PhysRegState[Reg]) { case regFree: break; case regReserved: dbgs() << "*"; break; default: { dbgs() << '=' << printReg(PhysRegState[Reg]); LiveRegMap::iterator I = findLiveVirtReg(PhysRegState[Reg]); assert(I != LiveVirtRegs.end() && "Missing VirtReg entry"); if (I->Dirty) dbgs() << "*"; assert(I->PhysReg == Reg && "Bad inverse map"); break; } } } dbgs() << '\n'; // Check that LiveVirtRegs is the inverse. for (LiveRegMap::iterator i = LiveVirtRegs.begin(), e = LiveVirtRegs.end(); i != e; ++i) { assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) && "Bad map key"); assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) && "Bad map value"); assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map"); } } #endif void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; DEBUG(dbgs() << "\nAllocating " << MBB); PhysRegState.assign(TRI->getNumRegs(), regDisabled); assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?"); MachineBasicBlock::iterator MII = MBB.begin(); // Add live-in registers as live. for (const MachineBasicBlock::RegisterMaskPair LI : MBB.liveins()) if (MRI->isAllocatable(LI.PhysReg)) - definePhysReg(*MII, LI.PhysReg, regReserved); + definePhysReg(MII, LI.PhysReg, regReserved); VirtDead.clear(); Coalesced.clear(); // Otherwise, sequentially allocate each instruction in the MBB. for (MachineInstr &MI : MBB) { const MCInstrDesc &MCID = MI.getDesc(); DEBUG( dbgs() << "\n>> " << MI << "Regs:"; dumpState() ); // Debug values are not allowed to change codegen in any way. if (MI.isDebugValue()) { MachineInstr *DebugMI = &MI; MachineOperand &MO = DebugMI->getOperand(0); // Ignore DBG_VALUEs that aren't based on virtual registers. These are // mostly constants and frame indices. if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; // See if this virtual register has already been allocated to a physical // register or spilled to a stack slot. LiveRegMap::iterator LRI = findLiveVirtReg(Reg); if (LRI != LiveVirtRegs.end()) setPhysReg(*DebugMI, 0, LRI->PhysReg); else { int SS = StackSlotForVirtReg[Reg]; if (SS != -1) { // Modify DBG_VALUE now that the value is in a spill slot. updateDbgValueForSpill(*DebugMI, SS); DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *DebugMI); continue; } // We can't allocate a physreg for a DebugValue, sorry! DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE"); MO.setReg(0); } // If Reg hasn't been spilled, put this DBG_VALUE in LiveDbgValueMap so // that future spills of Reg will have DBG_VALUEs. LiveDbgValueMap[Reg].push_back(DebugMI); continue; } // If this is a copy, we may be able to coalesce. unsigned CopySrcReg = 0; unsigned CopyDstReg = 0; unsigned CopySrcSub = 0; unsigned CopyDstSub = 0; if (MI.isCopy()) { CopyDstReg = MI.getOperand(0).getReg(); CopySrcReg = MI.getOperand(1).getReg(); CopyDstSub = MI.getOperand(0).getSubReg(); CopySrcSub = MI.getOperand(1).getSubReg(); } // Track registers used by instruction. UsedInInstr.clear(); // First scan. // Mark physreg uses and early clobbers as used. 
// Find the end of the virtreg operands unsigned VirtOpEnd = 0; bool hasTiedOps = false; bool hasEarlyClobbers = false; bool hasPartialRedefs = false; bool hasPhysDefs = false; for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); // Make sure MRI knows about registers clobbered by regmasks. if (MO.isRegMask()) { MRI->addPhysRegsUsedFromRegMask(MO.getRegMask()); continue; } if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; if (TargetRegisterInfo::isVirtualRegister(Reg)) { VirtOpEnd = i+1; if (MO.isUse()) { hasTiedOps = hasTiedOps || MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1; } else { if (MO.isEarlyClobber()) hasEarlyClobbers = true; if (MO.getSubReg() && MI.readsVirtualRegister(Reg)) hasPartialRedefs = true; } continue; } if (!MRI->isAllocatable(Reg)) continue; if (MO.isUse()) { usePhysReg(MO); } else if (MO.isEarlyClobber()) { definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ? regFree : regReserved); hasEarlyClobbers = true; } else hasPhysDefs = true; } // The instruction may have virtual register operands that must be allocated // the same register at use-time and def-time: early clobbers and tied // operands. If there are also physical defs, these registers must avoid // both physical defs and uses, making them more constrained than normal // operands. // Similarly, if there are multiple defs and tied operands, we must make // sure the same register is allocated to uses and defs. // We didn't detect inline asm tied operands above, so just make this extra // pass for all inline asm. if (MI.isInlineAsm() || hasEarlyClobbers || hasPartialRedefs || (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) { handleThroughOperands(MI, VirtDead); // Don't attempt coalescing when we have funny stuff going on. CopyDstReg = 0; // Pretend we have early clobbers so the use operands get marked below. // This is not necessary for the common case of a single tied use. hasEarlyClobbers = true; } // Second scan. // Allocate virtreg uses. for (unsigned I = 0; I != VirtOpEnd; ++I) { const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; if (MO.isUse()) { LiveRegMap::iterator LRI = reloadVirtReg(MI, I, Reg, CopyDstReg); MCPhysReg PhysReg = LRI->PhysReg; CopySrcReg = (CopySrcReg == Reg || CopySrcReg == PhysReg) ? PhysReg : 0; if (setPhysReg(MI, I, PhysReg)) killVirtReg(LRI); } } // Track registers defined by instruction - early clobbers and tied uses at // this point. UsedInInstr.clear(); if (hasEarlyClobbers) { for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue; // Look for physreg defs and tied uses. if (!MO.isDef() && !MO.isTied()) continue; markRegUsedInInstr(Reg); } } unsigned DefOpEnd = MI.getNumOperands(); if (MI.isCall()) { // Spill all virtregs before a call. This serves one purpose: If an // exception is thrown, the landing pad is going to expect to find // registers in their spill slots. // Note: although this is appealing to just consider all definitions // as call-clobbered, this is not correct because some of those // definitions may be used later on and we do not want to reuse // those for virtual registers in between. DEBUG(dbgs() << " Spilling remaining registers before call.\n"); spillAll(MI); } // Third scan. // Allocate defs and collect dead defs. 
    for (unsigned I = 0; I != DefOpEnd; ++I) {
      const MachineOperand &MO = MI.getOperand(I);
      if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
        continue;
      unsigned Reg = MO.getReg();

      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
        if (!MRI->isAllocatable(Reg)) continue;
        definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
        continue;
      }
      LiveRegMap::iterator LRI = defineVirtReg(MI, I, Reg, CopySrcReg);
      MCPhysReg PhysReg = LRI->PhysReg;
      if (setPhysReg(MI, I, PhysReg)) {
        VirtDead.push_back(Reg);
        CopyDstReg = 0; // cancel coalescing
      } else
        CopyDstReg = (CopyDstReg == Reg || CopyDstReg == PhysReg) ? PhysReg : 0;
    }

    // Kill dead defs after the scan to ensure that multiple defs of the same
    // register are allocated identically. We didn't need to do this for uses
    // because we are creating our own kill flags, and they are always at the
    // last use.
    for (unsigned VirtReg : VirtDead)
      killVirtReg(VirtReg);
    VirtDead.clear();

    if (CopyDstReg && CopyDstReg == CopySrcReg && CopyDstSub == CopySrcSub) {
      DEBUG(dbgs() << "-- coalescing: " << MI);
      Coalesced.push_back(&MI);
    } else {
      DEBUG(dbgs() << "<< " << MI);
    }
  }

  // Spill all physical registers holding virtual registers now.
  DEBUG(dbgs() << "Spilling live registers at end of block.\n");
  spillAll(MBB.getFirstTerminator());

  // Erase all the coalesced copies. We are delaying it until now because
  // LiveVirtRegs might refer to the instrs.
  for (MachineInstr *MI : Coalesced)
    MBB.erase(MI);
  NumCopies += Coalesced.size();

  DEBUG(MBB.dump());
}

/// Allocates registers for a function.
bool RegAllocFast::runOnMachineFunction(MachineFunction &MF) {
  DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
               << "********** Function: " << MF.getName() << '\n');
  MRI = &MF.getRegInfo();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  TRI = STI.getRegisterInfo();
  TII = STI.getInstrInfo();
  MFI = &MF.getFrameInfo();
  MRI->freezeReservedRegs(MF);
  RegClassInfo.runOnMachineFunction(MF);
  UsedInInstr.clear();
  UsedInInstr.setUniverse(TRI->getNumRegUnits());

  // Initialize the virtual->physical register map to have a 'null'
  // mapping for all virtual registers.
  unsigned NumVirtRegs = MRI->getNumVirtRegs();
  StackSlotForVirtReg.resize(NumVirtRegs);
  LiveVirtRegs.setUniverse(NumVirtRegs);

  // Loop over all of the basic blocks, eliminating virtual register references
  for (MachineBasicBlock &MBB : MF)
    allocateBasicBlock(MBB);

  // All machine operands and other references to virtual registers have been
  // replaced. Remove the virtual registers.
  MRI->clearVirtRegs();

  StackSlotForVirtReg.clear();
  LiveDbgValueMap.clear();
  return true;
}

FunctionPass *llvm::createFastRegisterAllocator() {
  return new RegAllocFast();
}
Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
===================================================================
--- head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp	(revision 328752)
+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp	(revision 328753)
@@ -1,3821 +1,3823 @@
//===- SelectionDAGISel.cpp - Implement the SelectionDAGISel class --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This implements the SelectionDAGISel class.
//
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/SelectionDAGISel.h"
#include "ScheduleDAGSDNodes.h"
#include "SelectionDAGBuilder.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "isel"
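//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// Decision table of the -fast-isel-abort option declared just below, restated
// as code; the enumerator names are assumptions. Level 0 never aborts, level 1
// aborts on ordinary lowering failures but tolerates argument, call and
// terminator failures, level 2 additionally aborts on argument lowering, and
// level 3 never falls back to SelectionDAG at all.
enum FastISelAbortLevel {
  AbortDisabled = 0,
  AbortMostFailures = 1,
  AbortAlsoArgLowering = 2,
  NeverFallBack = 3
};

inline bool mayFallBackToSelectionDAG(int Level) {
  return Level < NeverFallBack;
}
//===----------------------------------------------------------------------===//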
STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
STATISTIC(NumFastIselFailLowerArguments,
          "Number of entry blocks where fast isel failed to lower arguments");

static cl::opt<int> EnableFastISelAbort(
    "fast-isel-abort", cl::Hidden,
    cl::desc("Enable abort calls when \"fast\" instruction selection "
             "fails to lower an instruction: 0 disable the abort, 1 will "
             "abort but for args, calls and terminators, 2 will also "
             "abort for argument lowering, and 3 will never fallback "
             "to SelectionDAG."));

static cl::opt<bool> EnableFastISelFallbackReport(
    "fast-isel-report-on-fallback", cl::Hidden,
    cl::desc("Emit a diagnostic when \"fast\" instruction selection "
             "falls back to SelectionDAG."));

static cl::opt<bool>
UseMBPI("use-mbpi",
        cl::desc("use Machine Branch Probability Info"),
        cl::init(true), cl::Hidden);

#ifndef NDEBUG
static cl::opt<std::string>
FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
                        cl::desc("Only display the basic block whose name "
                                 "matches this for all view-*-dags options"));
static cl::opt<bool>
ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
          cl::desc("Pop up a window to show dags before the first "
                   "dag combine pass"));
static cl::opt<bool>
ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
          cl::desc("Pop up a window to show dags before legalize types"));
static cl::opt<bool>
ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
          cl::desc("Pop up a window to show dags before legalize"));
static cl::opt<bool>
ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
          cl::desc("Pop up a window to show dags before the second "
                   "dag combine pass"));
static cl::opt<bool>
ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
          cl::desc("Pop up a window to show dags before the post legalize types"
                   " dag combine pass"));
static cl::opt<bool>
ViewISelDAGs("view-isel-dags", cl::Hidden,
          cl::desc("Pop up a window to show isel dags as they are selected"));
static cl::opt<bool>
ViewSchedDAGs("view-sched-dags", cl::Hidden,
          cl::desc("Pop up a window to show sched dags as they are processed"));
static cl::opt<bool>
ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
          cl::desc("Pop up a window to show SUnit dags after they are processed"));
#else
static const bool ViewDAGCombine1 = false, ViewLegalizeTypesDAGs = false,
                  ViewLegalizeDAGs = false, ViewDAGCombine2 = false,
                  ViewDAGCombineLT = false, ViewISelDAGs = false,
                  ViewSchedDAGs = false, ViewSUnitDAGs = false;
#endif

//===---------------------------------------------------------------------===//
///
/// RegisterScheduler class - Track the registration of instruction schedulers.
///
//===---------------------------------------------------------------------===//
MachinePassRegistry RegisterScheduler::Registry;

//===---------------------------------------------------------------------===//
///
/// ISHeuristic command line option for instruction schedulers.
///
//===---------------------------------------------------------------------===//
static cl::opt<RegisterScheduler::FunctionPassCtor, false,
               RegisterPassParser<RegisterScheduler>>
ISHeuristic("pre-RA-sched",
            cl::init(&createDefaultScheduler), cl::Hidden,
            cl::desc("Instruction schedulers available (before register"
                     " allocation):"));

static RegisterScheduler
defaultListDAGScheduler("default", "Best scheduler for the target",
                        createDefaultScheduler);
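//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// A target or plugin can add its own entry next to defaultListDAGScheduler
// above; everything named "my*" here is hypothetical. After registration the
// scheduler is selectable with -pre-RA-sched=my-list.
static ScheduleDAGSDNodes *createMyListScheduler(SelectionDAGISel *IS,
                                                 CodeGenOpt::Level OptLevel);

static RegisterScheduler
myListDAGScheduler("my-list", "Example scheduler registration",
                   createMyListScheduler);
//===----------------------------------------------------------------------===//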
namespace llvm {

//===--------------------------------------------------------------------===//
/// \brief This class is used by SelectionDAGISel to temporarily override
/// the optimization level on a per-function basis.
class OptLevelChanger {
  SelectionDAGISel &IS;
  CodeGenOpt::Level SavedOptLevel;
  bool SavedFastISel;

public:
  OptLevelChanger(SelectionDAGISel &ISel,
                  CodeGenOpt::Level NewOptLevel) : IS(ISel) {
    SavedOptLevel = IS.OptLevel;
    if (NewOptLevel == SavedOptLevel)
      return;
    IS.OptLevel = NewOptLevel;
    IS.TM.setOptLevel(NewOptLevel);
    DEBUG(dbgs() << "\nChanging optimization level for Function "
          << IS.MF->getFunction().getName() << "\n");
    DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
          << " ; After: -O" << NewOptLevel << "\n");
    SavedFastISel = IS.TM.Options.EnableFastISel;
    if (NewOptLevel == CodeGenOpt::None) {
      IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
      DEBUG(dbgs() << "\tFastISel is "
            << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
            << "\n");
    }
  }

  ~OptLevelChanger() {
    if (IS.OptLevel == SavedOptLevel)
      return;
    DEBUG(dbgs() << "\nRestoring optimization level for Function "
          << IS.MF->getFunction().getName() << "\n");
    DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
          << " ; After: -O" << SavedOptLevel << "\n");
    IS.OptLevel = SavedOptLevel;
    IS.TM.setOptLevel(SavedOptLevel);
    IS.TM.setFastISel(SavedFastISel);
  }
};

//===--------------------------------------------------------------------===//
/// createDefaultScheduler - This creates an instruction scheduler appropriate
/// for the target.
ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
                                           CodeGenOpt::Level OptLevel) {
  const TargetLowering *TLI = IS->TLI;
  const TargetSubtargetInfo &ST = IS->MF->getSubtarget();

  // Try first to see if the Target has its own way of selecting a scheduler
  if (auto *SchedulerCtor = ST.getDAGScheduler(OptLevel)) {
    return SchedulerCtor(IS, OptLevel);
  }

  if (OptLevel == CodeGenOpt::None ||
      (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) ||
      TLI->getSchedulingPreference() == Sched::Source)
    return createSourceListDAGScheduler(IS, OptLevel);
  if (TLI->getSchedulingPreference() == Sched::RegPressure)
    return createBURRListDAGScheduler(IS, OptLevel);
  if (TLI->getSchedulingPreference() == Sched::Hybrid)
    return createHybridListDAGScheduler(IS, OptLevel);
  if (TLI->getSchedulingPreference() == Sched::VLIW)
    return createVLIWDAGScheduler(IS, OptLevel);
  assert(TLI->getSchedulingPreference() == Sched::ILP &&
         "Unknown sched type!");
  return createILPListDAGScheduler(IS, OptLevel);
}

} // end namespace llvm
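//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// Typical shape of a target's override of the hook documented below; the
// MyTarget names and the emitSelectPseudo helper are hypothetical, not real
// LLVM symbols:
MachineBasicBlock *
MyTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                              MachineBasicBlock *MBB) const {
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("unexpected instr type to insert");
  case MyTarget::SELECT_I32:
    // Expand the pseudo into explicit diamond control flow, updating MBB's
    // successor edges as the contract below requires.
    return emitSelectPseudo(MI, MBB);
  }
}
//===----------------------------------------------------------------------===//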
// EmitInstrWithCustomInserter - This method should be implemented by targets
// that mark instructions with the 'usesCustomInserter' flag.  These
// instructions are special in various ways, which require special support to
// insert.  The specified MachineInstr is created but not inserted into any
// basic blocks, and this method is called to expand it into a sequence of
// instructions, potentially also creating new basic blocks and control flow.
// When new basic blocks are inserted and the edges from MBB to its successors
// are modified, the method should insert pairs of <OldSucc, NewSucc> into the
// DenseMap.
MachineBasicBlock *
TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const {
#ifndef NDEBUG
  dbgs() << "If a target marks an instruction with "
            "'usesCustomInserter', it must implement "
            "TargetLowering::EmitInstrWithCustomInserter!";
#endif
  llvm_unreachable(nullptr);
}

void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                   SDNode *Node) const {
  assert(!MI.hasPostISelHook() &&
         "If a target marks an instruction with 'hasPostISelHook', "
         "it must implement TargetLowering::AdjustInstrPostInstrSelection!");
}

//===----------------------------------------------------------------------===//
// SelectionDAGISel code
//===----------------------------------------------------------------------===//

SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, CodeGenOpt::Level OL)
    : MachineFunctionPass(ID), TM(tm), FuncInfo(new FunctionLoweringInfo()),
      CurDAG(new SelectionDAG(tm, OL)),
      SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), AA(), GFI(),
      OptLevel(OL), DAGSize(0) {
  initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
  initializeBranchProbabilityInfoWrapperPassPass(
      *PassRegistry::getPassRegistry());
  initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
  initializeTargetLibraryInfoWrapperPassPass(
      *PassRegistry::getPassRegistry());
}

SelectionDAGISel::~SelectionDAGISel() {
  delete SDB;
  delete CurDAG;
  delete FuncInfo;
}

void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
  if (OptLevel != CodeGenOpt::None)
    AU.addRequired<AAResultsWrapperPass>();
  AU.addRequired<GCModuleInfo>();
  AU.addRequired<StackProtector>();
  AU.addPreserved<StackProtector>();
  AU.addPreserved<GCModuleInfo>();
  AU.addRequired<TargetLibraryInfoWrapperPass>();
  if (UseMBPI && OptLevel != CodeGenOpt::None)
    AU.addRequired<BranchProbabilityInfoWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

/// SplitCriticalSideEffectEdges - Look for critical edges with a PHI value
/// that may trap on it.  In this case we have to split the edge so that the
/// path through the predecessor block that doesn't go to the phi block doesn't
/// execute the possibly trapping instruction. If available, we pass domtree
/// and loop info to be updated when we split critical edges. This is because
/// SelectionDAGISel preserves these analyses.
/// This is required for correctness, so it must be done at -O0.
///
static void SplitCriticalSideEffectEdges(Function &Fn, DominatorTree *DT,
                                         LoopInfo *LI) {
  // Loop for blocks with phi nodes.
  for (BasicBlock &BB : Fn) {
    PHINode *PN = dyn_cast<PHINode>(BB.begin());
    if (!PN) continue;

  ReprocessBlock:
    // For each block with a PHI node, check to see if any of the input values
    // are potentially trapping constant expressions.  Constant expressions are
    // the only potentially trapping value that can occur as the argument to a
    // PHI.
    for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I)); ++I)
      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
        ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i));
        if (!CE || !CE->canTrap()) continue;

        // The only case we have to worry about is when the edge is critical.
        // Since this block has a PHI Node, we assume it has multiple input
        // edges: check to see if the pred has multiple successors.
        BasicBlock *Pred = PN->getIncomingBlock(i);
        if (Pred->getTerminator()->getNumSuccessors() == 1)
          continue;

        // Okay, we have to split this edge.
        SplitCriticalEdge(
            Pred->getTerminator(), GetSuccessorNumber(Pred, &BB),
            CriticalEdgeSplittingOptions(DT, LI).setMergeIdenticalEdges());
        goto ReprocessBlock;
      }
  }
}
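//===-- Illustrative sketch (editorial, not part of this patch) ----------===//
// The trigger condition inside SplitCriticalSideEffectEdges() above, isolated
// into a predicate (hypothetical helper, using the IR types this file already
// includes): split when a PHI receives a potentially trapping constant
// expression across a critical edge.
static bool needsSideEffectEdgeSplit(const PHINode &PN, unsigned i) {
  const auto *CE = dyn_cast<ConstantExpr>(PN.getIncomingValue(i));
  return CE && CE->canTrap() &&
         PN.getIncomingBlock(i)->getTerminator()->getNumSuccessors() > 1;
}
//===----------------------------------------------------------------------===//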
bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
  // If we already selected that function, we do not need to run SDISel.
  if (mf.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;
  // Do some sanity-checking on the command-line options.
  assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
         "-fast-isel-abort > 0 requires -fast-isel");

  const Function &Fn = mf.getFunction();
  MF = &mf;

  // Reset the target options before resetting the optimization
  // level below.
  // FIXME: This is a horrible hack and should be processed via
  // codegen looking at the optimization level explicitly when
  // it wants to look at it.
  TM.resetTargetOptions(Fn);
  // Reset OptLevel to None for optnone functions.
  CodeGenOpt::Level NewOptLevel = OptLevel;
  if (OptLevel != CodeGenOpt::None && skipFunction(Fn))
    NewOptLevel = CodeGenOpt::None;
  OptLevelChanger OLC(*this, NewOptLevel);

  TII = MF->getSubtarget().getInstrInfo();
  TLI = MF->getSubtarget().getTargetLowering();
  RegInfo = &MF->getRegInfo();
  LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
  GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
  ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
  LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;

  DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");

  SplitCriticalSideEffectEdges(const_cast<Function &>(Fn), DT, LI);

  CurDAG->init(*MF, *ORE, this);
  FuncInfo->set(Fn, *MF, CurDAG);

  // Now get the optional analyses if we want to.
  // This is based on the possibly changed OptLevel (after optnone is taken
  // into account).  That's unfortunate but OK because it just means we won't
  // ask for passes that have been required anyway.

  if (UseMBPI && OptLevel != CodeGenOpt::None)
    FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
  else
    FuncInfo->BPI = nullptr;

  if (OptLevel != CodeGenOpt::None)
    AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  else
    AA = nullptr;

  SDB->init(GFI, AA, LibInfo);

  MF->setHasInlineAsm(false);

  FuncInfo->SplitCSR = false;

  // We split CSR if the target supports it for the given function
  // and the function has only return exits.
  if (OptLevel != CodeGenOpt::None && TLI->supportSplitCSR(MF)) {
    FuncInfo->SplitCSR = true;

    // Collect all the return blocks.
    for (const BasicBlock &BB : Fn) {
      if (!succ_empty(&BB))
        continue;

      const TerminatorInst *Term = BB.getTerminator();
      if (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))
        continue;

      // Bail out if the exit block is not Return nor Unreachable.
      FuncInfo->SplitCSR = false;
      break;
    }
  }

  MachineBasicBlock *EntryMBB = &MF->front();
  if (FuncInfo->SplitCSR)
    // This performs initialization so lowering for SplitCSR will be correct.
    TLI->initializeSplitCSR(EntryMBB);

  SelectAllBasicBlocks(Fn);
  if (FastISelFailed && EnableFastISelFallbackReport) {
    DiagnosticInfoISelFallback DiagFallback(Fn);
    Fn.getContext().diagnose(DiagFallback);
  }

  // If the first basic block in the function has live ins that need to be
  // copied into vregs, emit the copies into the top of the block before
  // emitting the code for the block.
  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
  RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII);

  // Insert copies in the entry block and the return blocks.
  if (FuncInfo->SplitCSR) {
    SmallVector<MachineBasicBlock*, 4> Returns;
    // Collect all the return blocks.
    for (MachineBasicBlock &MBB : mf) {
      if (!MBB.succ_empty())
        continue;

      MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
      if (Term != MBB.end() && Term->isReturn()) {
        Returns.push_back(&MBB);
        continue;
      }
    }
    TLI->insertCopiesSplitCSR(EntryMBB, Returns);
  }

  DenseMap<unsigned, unsigned> LiveInMap;
  if (!FuncInfo->ArgDbgValues.empty())
    for (std::pair<unsigned, unsigned> LI : RegInfo->liveins())
      if (LI.second)
        LiveInMap.insert(LI);

  // Insert DBG_VALUE instructions for function arguments to the entry block.
  for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
    MachineInstr *MI = FuncInfo->ArgDbgValues[e-i-1];
    bool hasFI = MI->getOperand(0).isFI();
    unsigned Reg =
        hasFI ? TRI.getFrameRegister(*MF) : MI->getOperand(0).getReg();
    if (TargetRegisterInfo::isPhysicalRegister(Reg))
      EntryMBB->insert(EntryMBB->begin(), MI);
    else {
      MachineInstr *Def = RegInfo->getVRegDef(Reg);
      if (Def) {
        MachineBasicBlock::iterator InsertPos = Def;
        // FIXME: VR def may not be in entry block.
        Def->getParent()->insert(std::next(InsertPos), MI);
      } else
        DEBUG(dbgs() << "Dropping debug info for dead vreg"
              << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
    }

    // If Reg is live-in then update debug info to track its copy in a vreg.
    DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
    if (LDI != LiveInMap.end()) {
      assert(!hasFI && "There's no handling of frame pointer updating here yet "
                       "- add if needed");
      MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
      MachineBasicBlock::iterator InsertPos = Def;
      const MDNode *Variable = MI->getDebugVariable();
      const MDNode *Expr = MI->getDebugExpression();
      DebugLoc DL = MI->getDebugLoc();
      bool IsIndirect = MI->isIndirectDebugValue();
      if (IsIndirect)
        assert(MI->getOperand(1).getImm() == 0 &&
               "DBG_VALUE with nonzero offset");
      assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
             "Expected inlined-at fields to agree");
      // Def is never a terminator here, so it is ok to increment InsertPos.
      BuildMI(*EntryMBB, ++InsertPos, DL, TII->get(TargetOpcode::DBG_VALUE),
              IsIndirect, LDI->second, Variable, Expr);

      // If this vreg is directly copied into an exported register then
      // the COPY instruction also needs a DBG_VALUE, if it is the only
      // user of LDI->second.
      MachineInstr *CopyUseMI = nullptr;
      for (MachineRegisterInfo::use_instr_iterator
           UI = RegInfo->use_instr_begin(LDI->second),
           E = RegInfo->use_instr_end(); UI != E; ) {
        MachineInstr *UseMI = &*(UI++);
        if (UseMI->isDebugValue()) continue;
        if (UseMI->isCopy() && !CopyUseMI && UseMI->getParent() == EntryMBB) {
          CopyUseMI = UseMI; continue;
        }
        // Otherwise this is another use or second copy use.
        CopyUseMI = nullptr; break;
      }
      if (CopyUseMI) {
        // Use MI's debug location, which describes where Variable was
        // declared, rather than whatever is attached to CopyUseMI.
        MachineInstr *NewMI =
            BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
                    CopyUseMI->getOperand(0).getReg(), Variable, Expr);
        MachineBasicBlock::iterator Pos = CopyUseMI;
        EntryMBB->insertAfter(Pos, NewMI);
      }
    }
  }

  // Determine if there are any calls in this machine function.
  MachineFrameInfo &MFI = MF->getFrameInfo();
  for (const auto &MBB : *MF) {
    if (MFI.hasCalls() && MF->hasInlineAsm())
      break;

    for (const auto &MI : MBB) {
      const MCInstrDesc &MCID = TII->get(MI.getOpcode());
      if ((MCID.isCall() && !MCID.isReturn()) ||
          MI.isStackAligningInlineAsm()) {
        MFI.setHasCalls(true);
      }
      if (MI.isInlineAsm()) {
        MF->setHasInlineAsm(true);
      }
    }
  }

  // Determine if there is a call to setjmp in the machine function.
  MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());

  // Replace forward-declared registers with the registers containing
  // the desired value.
  MachineRegisterInfo &MRI = MF->getRegInfo();
  for (DenseMap<unsigned, unsigned>::iterator
       I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end();
       I != E; ++I) {
    unsigned From = I->first;
    unsigned To = I->second;
    // If To is also scheduled to be replaced, find what its ultimate
    // replacement is.
    while (true) {
      DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
      if (J == E) break;
      To = J->second;
    }
    // Make sure the new register has a sufficiently constrained register class.
    if (TargetRegisterInfo::isVirtualRegister(From) &&
        TargetRegisterInfo::isVirtualRegister(To))
      MRI.constrainRegClass(To, MRI.getRegClass(From));
    // Replace it.

    // Replacing one register with another won't touch the kill flags.
    // We need to conservatively clear the kill flags as a kill on the old
    // register might dominate existing uses of the new register.
    if (!MRI.use_empty(To))
      MRI.clearKillFlags(From);
    MRI.replaceRegWith(From, To);
  }

  TLI->finalizeLowering(*MF);

  // Release function-specific state. SDB and CurDAG are already cleared
  // at this point.
  FuncInfo->clear();

  DEBUG(dbgs() << "*** MachineFunction at end of ISel ***\n");
  DEBUG(MF->print(dbgs()));

  return true;
}

static void reportFastISelFailure(MachineFunction &MF,
                                  OptimizationRemarkEmitter &ORE,
                                  OptimizationRemarkMissed &R,
                                  bool ShouldAbort) {
  // Print the function name explicitly if we don't have a debug location
  // (which makes the diagnostic less useful) or if we're going to emit a raw
  // error.
  if (!R.getLocation().isValid() || ShouldAbort)
    R << (" (in function: " + MF.getName() + ")").str();

  if (ShouldAbort)
    report_fatal_error(R.getMsg());

  ORE.emit(R);
}

void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
                                        BasicBlock::const_iterator End,
                                        bool &HadTailCall) {
  // Allow creating illegal types during DAG building for the basic block.
  CurDAG->NewNodesMustHaveLegalTypes = false;

  // Lower the instructions. If a call is emitted as a tail call, cease
  // emitting nodes for this block.
  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall;
       ++I) {
    if (!ElidedArgCopyInstrs.count(&*I))
      SDB->visit(*I);
  }

  // Make sure the root of the DAG is up-to-date.
  CurDAG->setRoot(SDB->getControlRoot());
  HadTailCall = SDB->HasTailCall;
  SDB->clear();

  // Final step, emit the lowered DAG as machine code.
  CodeGenAndEmitDAG();
}

void SelectionDAGISel::ComputeLiveOutVRegInfo() {
  SmallPtrSet<SDNode*, 16> VisitedNodes;
  SmallVector<SDNode*, 128> Worklist;

  Worklist.push_back(CurDAG->getRoot().getNode());

  KnownBits Known;

  do {
    SDNode *N = Worklist.pop_back_val();

    // If we've already seen this node, ignore it.
    if (!VisitedNodes.insert(N).second)
      continue;

    // Otherwise, add all chain operands to the worklist.
    for (const SDValue &Op : N->op_values())
      if (Op.getValueType() == MVT::Other)
        Worklist.push_back(Op.getNode());

    // If this is a CopyToReg with a vreg dest, process it.
    if (N->getOpcode() != ISD::CopyToReg)
      continue;

    unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(DestReg))
      continue;

    // Ignore non-scalar or non-integer values.
SDValue Src = N->getOperand(2); EVT SrcVT = Src.getValueType(); if (!SrcVT.isInteger() || SrcVT.isVector()) continue; unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src); CurDAG->computeKnownBits(Src, Known); FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, Known); } while (!Worklist.empty()); } void SelectionDAGISel::CodeGenAndEmitDAG() { StringRef GroupName = "sdag"; StringRef GroupDescription = "Instruction Selection and Scheduling"; std::string BlockName; int BlockNumber = -1; (void)BlockNumber; bool MatchFilterBB = false; (void)MatchFilterBB; // Pre-type legalization allow creation of any node types. CurDAG->NewNodesMustHaveLegalTypes = false; #ifndef NDEBUG MatchFilterBB = (FilterDAGBasicBlockName.empty() || FilterDAGBasicBlockName == FuncInfo->MBB->getBasicBlock()->getName().str()); #endif #ifdef NDEBUG if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs || ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs || ViewSUnitDAGs) #endif { BlockNumber = FuncInfo->MBB->getNumber(); BlockName = (MF->getName() + ":" + FuncInfo->MBB->getBasicBlock()->getName()).str(); } DEBUG(dbgs() << "Initial selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); if (ViewDAGCombine1 && MatchFilterBB) CurDAG->viewGraph("dag-combine1 input for " + BlockName); // Run the DAG combiner in pre-legalize mode. { NamedRegionTimer T("combine1", "DAG Combining 1", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized lowered selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); // Second step, hack on the DAG until it only uses operations and types that // the target supports. if (ViewLegalizeTypesDAGs && MatchFilterBB) CurDAG->viewGraph("legalize-types input for " + BlockName); bool Changed; { NamedRegionTimer T("legalize_types", "Type Legalization", GroupName, GroupDescription, TimePassesIsEnabled); Changed = CurDAG->LegalizeTypes(); } DEBUG(dbgs() << "Type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); // Only allow creation of legal node types. CurDAG->NewNodesMustHaveLegalTypes = true; if (Changed) { if (ViewDAGCombineLT && MatchFilterBB) CurDAG->viewGraph("dag-combine-lt input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. { NamedRegionTimer T("combine_lt", "DAG Combining after legalize types", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); } { NamedRegionTimer T("legalize_vec", "Vector Legalization", GroupName, GroupDescription, TimePassesIsEnabled); Changed = CurDAG->LegalizeVectors(); } if (Changed) { DEBUG(dbgs() << "Vector-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); { NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } DEBUG(dbgs() << "Vector/type-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); if (ViewDAGCombineLT && MatchFilterBB) CurDAG->viewGraph("dag-combine-lv input for " + BlockName); // Run the DAG combiner in post-type-legalize mode. 
{ NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel); } DEBUG(dbgs() << "Optimized vector-legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); } if (ViewLegalizeDAGs && MatchFilterBB) CurDAG->viewGraph("legalize input for " + BlockName); { NamedRegionTimer T("legalize", "DAG Legalization", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->Legalize(); } DEBUG(dbgs() << "Legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); if (ViewDAGCombine2 && MatchFilterBB) CurDAG->viewGraph("dag-combine2 input for " + BlockName); // Run the DAG combiner in post-legalize mode. { NamedRegionTimer T("combine2", "DAG Combining 2", GroupName, GroupDescription, TimePassesIsEnabled); CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); } DEBUG(dbgs() << "Optimized legalized selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); if (OptLevel != CodeGenOpt::None) ComputeLiveOutVRegInfo(); if (ViewISelDAGs && MatchFilterBB) CurDAG->viewGraph("isel input for " + BlockName); // Third, instruction select all of the operations to machine code, adding the // code to the MachineBasicBlock. { NamedRegionTimer T("isel", "Instruction Selection", GroupName, GroupDescription, TimePassesIsEnabled); DoInstructionSelection(); } DEBUG(dbgs() << "Selected selection DAG: " << printMBBReference(*FuncInfo->MBB) << " '" << BlockName << "'\n"; CurDAG->dump()); if (ViewSchedDAGs && MatchFilterBB) CurDAG->viewGraph("scheduler input for " + BlockName); // Schedule machine code. ScheduleDAGSDNodes *Scheduler = CreateScheduler(); { NamedRegionTimer T("sched", "Instruction Scheduling", GroupName, GroupDescription, TimePassesIsEnabled); Scheduler->Run(CurDAG, FuncInfo->MBB); } if (ViewSUnitDAGs && MatchFilterBB) Scheduler->viewGraph(); // Emit machine code to BB. This can change 'BB' to the last block being // inserted into. MachineBasicBlock *FirstMBB = FuncInfo->MBB, *LastMBB; { NamedRegionTimer T("emit", "Instruction Creation", GroupName, GroupDescription, TimePassesIsEnabled); // FuncInfo->InsertPt is passed by reference and set to the end of the // scheduled instructions. LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(FuncInfo->InsertPt); } // If the block was split, make sure we update any references that are used to // update PHI nodes later on. if (FirstMBB != LastMBB) SDB->UpdateSplitBlock(FirstMBB, LastMBB); // Free the scheduler state. { NamedRegionTimer T("cleanup", "Instruction Scheduling Cleanup", GroupName, GroupDescription, TimePassesIsEnabled); delete Scheduler; } // Free the SelectionDAG state, now that we're finished with it. CurDAG->clear(); } namespace { /// ISelUpdater - helper class to handle updates of the instruction selection /// graph. class ISelUpdater : public SelectionDAG::DAGUpdateListener { SelectionDAG::allnodes_iterator &ISelPosition; public: ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp) : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {} /// NodeDeleted - Handle nodes deleted from the graph. If the node being /// deleted is the current ISelPosition node, update ISelPosition. 
/// void NodeDeleted(SDNode *N, SDNode *E) override { if (ISelPosition == SelectionDAG::allnodes_iterator(N)) ++ISelPosition; } }; } // end anonymous namespace void SelectionDAGISel::DoInstructionSelection() { DEBUG(dbgs() << "===== Instruction selection begins: " << printMBBReference(*FuncInfo->MBB) << " '" << FuncInfo->MBB->getName() << "'\n"); PreprocessISelDAG(); // Select target instructions for the DAG. { // Number all nodes with a topological order and set DAGSize. DAGSize = CurDAG->AssignTopologicalOrder(); // Create a dummy node (which is not added to allnodes), that adds // a reference to the root node, preventing it from being deleted, // and tracking any changes of the root. HandleSDNode Dummy(CurDAG->getRoot()); SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode()); ++ISelPosition; // Make sure that ISelPosition gets properly updated when nodes are deleted // in calls made from this function. ISelUpdater ISU(*CurDAG, ISelPosition); // The AllNodes list is now topologically sorted. Visit the // nodes by starting at the end of the list (the root of the // graph) and proceeding back toward the beginning (the entry // node). while (ISelPosition != CurDAG->allnodes_begin()) { SDNode *Node = &*--ISelPosition; // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes, // but there are currently some corner cases that it misses. Also, this // makes it theoretically possible to disable the DAGCombiner. if (Node->use_empty()) continue; // When we are using non-default rounding modes or FP exception behavior // FP operations are represented by StrictFP pseudo-operations. They // need to be simplified here so that the target-specific instruction // selectors know how to handle them. // // If the current node is a strict FP pseudo-op, the isStrictFPOp() // function will provide the corresponding normal FP opcode to which the // node should be mutated. // // FIXME: The backends need a way to handle FP constraints. if (Node->isStrictFPOpcode()) Node = CurDAG->mutateStrictFPToFP(Node); Select(Node); } CurDAG->setRoot(Dummy.getValue()); } DEBUG(dbgs() << "===== Instruction selection ends:\n"); PostprocessISelDAG(); } static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) { for (const User *U : CPI->users()) { if (const IntrinsicInst *EHPtrCall = dyn_cast<IntrinsicInst>(U)) { Intrinsic::ID IID = EHPtrCall->getIntrinsicID(); if (IID == Intrinsic::eh_exceptionpointer || IID == Intrinsic::eh_exceptioncode) return true; } } return false; } /// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and /// do other setup for EH landing-pad blocks. bool SelectionDAGISel::PrepareEHLandingPad() { MachineBasicBlock *MBB = FuncInfo->MBB; const Constant *PersonalityFn = FuncInfo->Fn->getPersonalityFn(); const BasicBlock *LLVMBB = MBB->getBasicBlock(); const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout())); // Catchpads have one live-in register, which typically holds the exception // pointer or code. if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) { if (hasExceptionPointerOrCodeUser(CPI)) { // Get or create the virtual register to hold the pointer or code. Mark // the live in physreg and copy into the vreg.
MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn); assert(EHPhysReg && "target lacks exception pointer register"); MBB->addLiveIn(EHPhysReg); unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC); BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), TII->get(TargetOpcode::COPY), VReg) .addReg(EHPhysReg, RegState::Kill); } return true; } if (!LLVMBB->isLandingPad()) return true; // Add a label to mark the beginning of the landing pad. Deletion of the // landing pad can thus be detected via the MachineModuleInfo. MCSymbol *Label = MF->addLandingPad(MBB); // Assign the call site to the landing pad's begin label. MF->setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]); const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL); BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II) .addSym(Label); // Mark exception register as live in. if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn)) FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC); // Mark exception selector register as live in. if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn)) FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC); return true; } /// isFoldedOrDeadInstruction - Return true if the specified instruction is /// side-effect free and is either dead or folded into a generated instruction. /// Return false if it needs to be emitted. static bool isFoldedOrDeadInstruction(const Instruction *I, FunctionLoweringInfo *FuncInfo) { return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded. !isa<TerminatorInst>(I) && // Terminators aren't folded. !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded. !I->isEHPad() && // EH pad instructions aren't folded. !FuncInfo->isExportedInst(I); // Exported instrs must be computed. } /// Set up SwiftErrorVals by going through the function. If the function has /// a swifterror argument, it will be the first entry. static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI, FunctionLoweringInfo *FuncInfo) { if (!TLI->supportSwiftError()) return; FuncInfo->SwiftErrorVals.clear(); FuncInfo->SwiftErrorVRegDefMap.clear(); FuncInfo->SwiftErrorVRegUpwardsUse.clear(); FuncInfo->SwiftErrorVRegDefUses.clear(); FuncInfo->SwiftErrorArg = nullptr; // Check if function has a swifterror argument. bool HaveSeenSwiftErrorArg = false; for (Function::const_arg_iterator AI = Fn.arg_begin(), AE = Fn.arg_end(); AI != AE; ++AI) if (AI->hasSwiftErrorAttr()) { assert(!HaveSeenSwiftErrorArg && "Must have only one swifterror parameter"); (void)HaveSeenSwiftErrorArg; // silence warning. HaveSeenSwiftErrorArg = true; FuncInfo->SwiftErrorArg = &*AI; FuncInfo->SwiftErrorVals.push_back(&*AI); } for (const auto &LLVMBB : Fn) for (const auto &Inst : LLVMBB) { if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&Inst)) if (Alloca->isSwiftError()) FuncInfo->SwiftErrorVals.push_back(Alloca); } } static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, FastISel *FastIS, const TargetLowering *TLI, const TargetInstrInfo *TII, SelectionDAGBuilder *SDB) { if (!TLI->supportSwiftError()) return; // We only need to do this when we have a swifterror parameter or a // swifterror alloca.
if (FuncInfo->SwiftErrorVals.empty()) return; assert(FuncInfo->MBB == &*FuncInfo->MF->begin() && "expected to insert into entry block"); auto &DL = FuncInfo->MF->getDataLayout(); auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { // We will always generate a copy from the argument. It is always used at // least by the 'return' of the swifterror. if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal) continue; unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC); // Assign Undef to VReg. We construct MI directly to make sure it works // with FastISel. BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(), SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), VReg); // Keep FastIS informed about the value we just inserted. if (FastIS) FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg); } } /// Collect llvm.dbg.declare information. This is done after argument lowering /// in case the declarations refer to arguments. static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) { MachineFunction *MF = FuncInfo->MF; const DataLayout &DL = MF->getDataLayout(); for (const BasicBlock &BB : *FuncInfo->Fn) { for (const Instruction &I : BB) { const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(&I); if (!DI) continue; assert(DI->getVariable() && "Missing variable"); assert(DI->getDebugLoc() && "Missing location"); const Value *Address = DI->getAddress(); if (!Address) continue; // Look through casts and constant offset GEPs. These mostly come from // inalloca. APInt Offset(DL.getTypeSizeInBits(Address->getType()), 0); Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); // Check if the variable is a static alloca or a byval or inalloca // argument passed in memory. If it is not, then we will ignore this // intrinsic and handle this during isel like dbg.value. int FI = std::numeric_limits<int>::max(); if (const auto *AI = dyn_cast<AllocaInst>(Address)) { auto SI = FuncInfo->StaticAllocaMap.find(AI); if (SI != FuncInfo->StaticAllocaMap.end()) FI = SI->second; } else if (const auto *Arg = dyn_cast<Argument>(Address)) FI = FuncInfo->getArgumentFrameIndex(Arg); if (FI == std::numeric_limits<int>::max()) continue; DIExpression *Expr = DI->getExpression(); if (Offset.getBoolValue()) Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, Offset.getZExtValue()); MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc()); } } } /// Propagate swifterror values through the machine function CFG. static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { auto *TLI = FuncInfo->TLI; if (!TLI->supportSwiftError()) return; // We only need to do this when we have a swifterror parameter or a // swifterror alloca. if (FuncInfo->SwiftErrorVals.empty()) return; // For each machine basic block in reverse post order. ReversePostOrderTraversal<MachineFunction *> RPOT(FuncInfo->MF); for (MachineBasicBlock *MBB : RPOT) { // For each swifterror value in the function. for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) { auto Key = std::make_pair(MBB, SwiftErrorVal); auto UUseIt = FuncInfo->SwiftErrorVRegUpwardsUse.find(Key); auto VRegDefIt = FuncInfo->SwiftErrorVRegDefMap.find(Key); bool UpwardsUse = UUseIt != FuncInfo->SwiftErrorVRegUpwardsUse.end(); unsigned UUseVReg = UpwardsUse ?
UUseIt->second : 0; bool DownwardDef = VRegDefIt != FuncInfo->SwiftErrorVRegDefMap.end(); assert(!(UpwardsUse && !DownwardDef) && "We can't have an upwards use but no downwards def"); // If there is no upwards exposed use and an entry for the swifterror in // the def map for this value we don't need to do anything: We already // have a downward def for this basic block. if (!UpwardsUse && DownwardDef) continue; // Otherwise we either have an upwards exposed use vreg that we need to // materialize or need to forward the downward def from predecessors. // Check whether we have a single vreg def from all predecessors. // Otherwise we need a phi. SmallVector<std::pair<MachineBasicBlock *, unsigned>, 4> VRegs; SmallSet<const MachineBasicBlock *, 8> Visited; for (auto *Pred : MBB->predecessors()) { if (!Visited.insert(Pred).second) continue; VRegs.push_back(std::make_pair( Pred, FuncInfo->getOrCreateSwiftErrorVReg(Pred, SwiftErrorVal))); if (Pred != MBB) continue; // We have a self-edge. // If there was no upwards use in this basic block there is now one: the // phi needs to use itself. if (!UpwardsUse) { UpwardsUse = true; UUseIt = FuncInfo->SwiftErrorVRegUpwardsUse.find(Key); assert(UUseIt != FuncInfo->SwiftErrorVRegUpwardsUse.end()); UUseVReg = UUseIt->second; } } // We need a phi node if we have more than one predecessor with different // downward defs. bool needPHI = VRegs.size() >= 1 && std::find_if( VRegs.begin(), VRegs.end(), [&](const std::pair<MachineBasicBlock *, unsigned> &V) -> bool { return V.second != VRegs[0].second; }) != VRegs.end(); // If there is no upwards exposed use and we don't need a phi just // forward the swifterror vreg from the predecessor(s). if (!UpwardsUse && !needPHI) { assert(!VRegs.empty() && "No predecessors? The entry block should bail out earlier"); // Just forward the swifterror vreg from the predecessor(s). FuncInfo->setCurrentSwiftErrorVReg(MBB, SwiftErrorVal, VRegs[0].second); continue; } auto DLoc = isa<Instruction>(SwiftErrorVal) ? dyn_cast<Instruction>(SwiftErrorVal)->getDebugLoc() : DebugLoc(); const auto *TII = FuncInfo->MF->getSubtarget().getInstrInfo(); // If we don't need a phi create a copy to the upward exposed vreg. if (!needPHI) { assert(UpwardsUse); assert(!VRegs.empty() && "No predecessors? Is the Calling Convention correct?"); unsigned DestReg = UUseVReg; BuildMI(*MBB, MBB->getFirstNonPHI(), DLoc, TII->get(TargetOpcode::COPY), DestReg) .addReg(VRegs[0].second); continue; } // We need a phi: if there is an upwards exposed use we already have a // destination virtual register number otherwise we generate a new one. auto &DL = FuncInfo->MF->getDataLayout(); auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL)); unsigned PHIVReg = UpwardsUse ? UUseVReg : FuncInfo->MF->getRegInfo().createVirtualRegister(RC); MachineInstrBuilder SwiftErrorPHI = BuildMI(*MBB, MBB->getFirstNonPHI(), DLoc, TII->get(TargetOpcode::PHI), PHIVReg); for (auto BBRegPair : VRegs) { SwiftErrorPHI.addReg(BBRegPair.second).addMBB(BBRegPair.first); } // We did not have a definition in this block before: store the phi's vreg // as this block's downward exposed def. if (!UpwardsUse) FuncInfo->setCurrentSwiftErrorVReg(MBB, SwiftErrorVal, PHIVReg); } } } static void preassignSwiftErrorRegs(const TargetLowering *TLI, FunctionLoweringInfo *FuncInfo, BasicBlock::const_iterator Begin, BasicBlock::const_iterator End) { if (!TLI->supportSwiftError() || FuncInfo->SwiftErrorVals.empty()) return; // Iterate over the instructions and assign vregs to swifterror defs and uses.
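// Summary of the classification applied below (an illustrative recap of the
// cases the loop handles, not upstream commentary): a call site taking a
// swifterror argument is both a use and a def; a load from a swifterror
// address is a use; a store to one is a def; and a return in a function
// carrying a swifterror attribute is a use.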
for (auto It = Begin; It != End; ++It) { ImmutableCallSite CS(&*It); if (CS) { // A call-site with a swifterror argument is both use and def. const Value *SwiftErrorAddr = nullptr; for (auto &Arg : CS.args()) { if (!Arg->isSwiftError()) continue; // Use of swifterror. assert(!SwiftErrorAddr && "Cannot have multiple swifterror arguments"); SwiftErrorAddr = &*Arg; assert(SwiftErrorAddr->isSwiftError() && "Must have a swifterror value argument"); unsigned VReg; bool CreatedReg; std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt( &*It, FuncInfo->MBB, SwiftErrorAddr); assert(CreatedReg); } if (!SwiftErrorAddr) continue; // Def of swifterror. unsigned VReg; bool CreatedReg; std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It); assert(CreatedReg); FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg); // A load is a use. } else if (const LoadInst *LI = dyn_cast<LoadInst>(&*It)) { const Value *V = LI->getOperand(0); if (!V->isSwiftError()) continue; unsigned VReg; bool CreatedReg; std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt(LI, FuncInfo->MBB, V); assert(CreatedReg); // A store is a def. } else if (const StoreInst *SI = dyn_cast<StoreInst>(&*It)) { const Value *SwiftErrorAddr = SI->getOperand(1); if (!SwiftErrorAddr->isSwiftError()) continue; // Def of swifterror. unsigned VReg; bool CreatedReg; std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegDefAt(&*It); assert(CreatedReg); FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorAddr, VReg); // A return in a swifterror returning function is a use. } else if (const ReturnInst *R = dyn_cast<ReturnInst>(&*It)) { const Function *F = R->getParent()->getParent(); if (!F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) continue; unsigned VReg; bool CreatedReg; std::tie(VReg, CreatedReg) = FuncInfo->getOrCreateSwiftErrorVRegUseAt( R, FuncInfo->MBB, FuncInfo->SwiftErrorArg); assert(CreatedReg); } } } void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FastISelFailed = false; // Initialize the Fast-ISel state, if needed. FastISel *FastIS = nullptr; - if (TM.Options.EnableFastISel) + if (TM.Options.EnableFastISel) { + DEBUG(dbgs() << "Enabling fast-isel\n"); FastIS = TLI->createFastISel(*FuncInfo, LibInfo); + } setupSwiftErrorVals(Fn, TLI, FuncInfo); ReversePostOrderTraversal<const Function *> RPOT(&Fn); // Lower arguments up front. An RPO iteration always visits the entry block // first. assert(*RPOT.begin() == &Fn.getEntryBlock()); ++NumEntryBlocks; // Set up FuncInfo for ISel. Entry blocks never have PHIs. FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()]; FuncInfo->InsertPt = FuncInfo->MBB->begin(); if (!FastIS) { LowerArguments(Fn); } else { // See if fast isel can lower the arguments. FastIS->startNewBlock(); if (!FastIS->lowerArguments()) { FastISelFailed = true; // Fast isel failed to lower these arguments ++NumFastIselFailLowerArguments; OptimizationRemarkMissed R("sdagisel", "FastISelFailure", Fn.getSubprogram(), &Fn.getEntryBlock()); R << "FastISel didn't lower all arguments: " << ore::NV("Prototype", Fn.getType()); reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1); // Use SelectionDAG argument lowering LowerArguments(Fn); CurDAG->setRoot(SDB->getControlRoot()); SDB->clear(); CodeGenAndEmitDAG(); } // If we inserted any instructions at the beginning, make a note of // where they are, so we can be sure to emit subsequent instructions // after them.
if (FuncInfo->InsertPt != FuncInfo->MBB->begin()) FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt)); else FastIS->setLastLocalValue(nullptr); } createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); processDbgDeclares(FuncInfo); // Iterate over all basic blocks in the function. for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); PI != PE; ++PI) { if (!FuncInfo->VisitedBBs.count(*PI)) { AllPredsVisited = false; break; } } if (AllPredsVisited) { for (const PHINode &PN : LLVMBB->phis()) FuncInfo->ComputePHILiveOutRegInfo(&PN); } else { for (const PHINode &PN : LLVMBB->phis()) FuncInfo->InvalidatePHILiveOutRegInfo(&PN); } FuncInfo->VisitedBBs.insert(LLVMBB); } BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHI()->getIterator(); BasicBlock::const_iterator const End = LLVMBB->end(); BasicBlock::const_iterator BI = End; FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB]; if (!FuncInfo->MBB) continue; // Some blocks like catchpads have no code or MBB. // Insert new instructions after any phi or argument setup code. FuncInfo->InsertPt = FuncInfo->MBB->end(); // Set up an EH landing-pad block. FuncInfo->ExceptionPointerVirtReg = 0; FuncInfo->ExceptionSelectorVirtReg = 0; if (LLVMBB->isEHPad()) if (!PrepareEHLandingPad()) continue; // Before doing SelectionDAG ISel, see if FastISel has been requested. if (FastIS) { if (LLVMBB != &Fn.getEntryBlock()) FastIS->startNewBlock(); unsigned NumFastIselRemaining = std::distance(Begin, End); // Pre-assign swifterror vregs. preassignSwiftErrorRegs(TLI, FuncInfo, Begin, End); // Do FastISel on as many instructions as possible. for (; BI != Begin; --BI) { const Instruction *Inst = &*std::prev(BI); // If we no longer require this instruction, skip it. if (isFoldedOrDeadInstruction(Inst, FuncInfo) || ElidedArgCopyInstrs.count(Inst)) { --NumFastIselRemaining; continue; } // Bottom-up: reset the insert pos at the top, after any local-value // instructions. FastIS->recomputeInsertPt(); // Try to select the instruction with FastISel. if (FastIS->selectInstruction(Inst)) { --NumFastIselRemaining; ++NumFastIselSuccess; // If fast isel succeeded, skip over all the folded instructions, and // then see if there is a load right before the selected instructions. // Try to fold the load if so. const Instruction *BeforeInst = Inst; while (BeforeInst != &*Begin) { BeforeInst = &*std::prev(BasicBlock::const_iterator(BeforeInst)); if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo)) break; } if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) && BeforeInst->hasOneUse() && FastIS->tryToFoldLoad(cast<LoadInst>(BeforeInst), Inst)) { // If we succeeded, don't re-select the load. BI = std::next(BasicBlock::const_iterator(BeforeInst)); --NumFastIselRemaining; ++NumFastIselSuccess; } continue; } FastISelFailed = true; // Then handle certain instructions as single-LLVM-Instruction blocks. // We cannot separate out GCrelocates to their own blocks since we need // to keep track of gc-relocates for a particular gc-statepoint. This is // done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before // visitGCRelocate.
if (isa<CallInst>(Inst) && !isStatepoint(Inst) && !isGCRelocate(Inst)) { OptimizationRemarkMissed R("sdagisel", "FastISelFailure", Inst->getDebugLoc(), LLVMBB); R << "FastISel missed call"; if (R.isEnabled() || EnableFastISelAbort) { std::string InstStrStorage; raw_string_ostream InstStr(InstStrStorage); InstStr << *Inst; R << ": " << InstStr.str(); } reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2); if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() && !Inst->use_empty()) { unsigned &R = FuncInfo->ValueMap[Inst]; if (!R) R = FuncInfo->CreateRegs(Inst->getType()); } bool HadTailCall = false; MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt; SelectBasicBlock(Inst->getIterator(), BI, HadTailCall); // If the call was emitted as a tail call, we're done with the block. // We also need to delete any previously emitted instructions. if (HadTailCall) { FastIS->removeDeadCode(SavedInsertPt, FuncInfo->MBB->end()); --BI; break; } // Recompute NumFastIselRemaining as Selection DAG instruction // selection may have handled the call, input args, etc. unsigned RemainingNow = std::distance(Begin, BI); NumFastIselFailures += NumFastIselRemaining - RemainingNow; NumFastIselRemaining = RemainingNow; continue; } OptimizationRemarkMissed R("sdagisel", "FastISelFailure", Inst->getDebugLoc(), LLVMBB); bool ShouldAbort = EnableFastISelAbort; if (isa<TerminatorInst>(Inst)) { // Use a different message for terminator misses. R << "FastISel missed terminator"; // Don't abort for terminator unless the level is really high ShouldAbort = (EnableFastISelAbort > 2); } else { R << "FastISel missed"; } if (R.isEnabled() || EnableFastISelAbort) { std::string InstStrStorage; raw_string_ostream InstStr(InstStrStorage); InstStr << *Inst; R << ": " << InstStr.str(); } reportFastISelFailure(*MF, *ORE, R, ShouldAbort); NumFastIselFailures += NumFastIselRemaining; break; } FastIS->recomputeInsertPt(); } if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) { bool FunctionBasedInstrumentation = TLI->getSSPStackGuardCheck(*Fn.getParent()); SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->MBBMap[LLVMBB], FunctionBasedInstrumentation); } if (Begin != BI) ++NumDAGBlocks; else ++NumFastIselBlocks; if (Begin != BI) { // Run SelectionDAG instruction selection on the remainder of the block // not handled by FastISel. If FastISel is not run, this is the entire // block. bool HadTailCall; SelectBasicBlock(Begin, BI, HadTailCall); // But if FastISel was run, we already selected some of the block. // If we emitted a tail-call, we need to delete any previously emitted // instruction that follows it. if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end()) FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end()); } FinishBasicBlock(); FuncInfo->PHINodesToUpdate.clear(); ElidedArgCopyInstrs.clear(); } propagateSwiftErrorVRegs(FuncInfo); delete FastIS; SDB->clearDanglingDebugInfo(); SDB->SPDescriptor.resetPerFunctionState(); } /// Given that the input MI is before a partial terminator sequence TSeq, /// return true if MI + TSeq is also a partial terminator sequence. /// /// A Terminator sequence is a sequence of MachineInstrs which at this point in /// lowering copy vregs into physical registers, which are then passed into /// terminator instructions so we can satisfy ABI constraints. A partial /// terminator sequence is an improper subset of a terminator sequence (i.e. it /// may be the whole terminator sequence).
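///
/// Illustrative sketch (not from the source): at this point a block might end
/// with
///   %v = ADDrr ...         ; ordinary instruction, not part of the sequence
///   %physreg0 = COPY %v    ; vreg-to-physreg copy, part of the sequence
///   DBG_VALUE ...          ; debug info kept together with the sequence
///   RET ...                ; the terminator itself
/// where ADDrr/RET stand in for arbitrary target opcodes; walking backwards
/// from the terminator, the copies, implicit defs, and any interleaved
/// DBG_VALUEs are all treated as part of the terminator sequence.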
static bool MIIsInTerminatorSequence(const MachineInstr &MI) { // If we do not have a copy or an implicit def, we return true if and only if // MI is a debug value. if (!MI.isCopy() && !MI.isImplicitDef()) // Sometimes DBG_VALUE MIs sneak in between the copies from the vregs to the // physical registers if there is debug info associated with the terminator // of our mbb. We want to include said debug info in our terminator // sequence, so we return true in that case. return MI.isDebugValue(); // We have left the terminator sequence if we are not doing one of the // following: // // 1. Copying a vreg into a physical register. // 2. Copying a vreg into a vreg. // 3. Defining a register via an implicit def. // OPI should always be a register definition... MachineInstr::const_mop_iterator OPI = MI.operands_begin(); if (!OPI->isReg() || !OPI->isDef()) return false; // Defining any register via an implicit def is always ok. if (MI.isImplicitDef()) return true; // Grab the copy source... MachineInstr::const_mop_iterator OPI2 = OPI; ++OPI2; assert(OPI2 != MI.operands_end() && "Should have a copy implying we should have 2 arguments."); // Make sure that the copy dest is not a vreg when the copy source is a // physical register. if (!OPI2->isReg() || (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) && TargetRegisterInfo::isPhysicalRegister(OPI2->getReg()))) return false; return true; } /// Find the split point at which to splice the end of BB into its successor's /// stack protector check machine basic block. /// /// On many platforms, due to ABI constraints, terminators, even before register /// allocation, use physical registers. This creates an issue for us since /// physical registers at this point cannot travel across basic /// blocks. Luckily, selectiondag always moves physical registers into vregs /// when they enter functions and moves them through a sequence of copies back /// into the physical registers right before the terminator creating a /// ``Terminator Sequence''. This function is searching for the beginning of the /// terminator sequence so that we can ensure that we splice off not just the /// terminator, but additionally the copies that move the vregs into the /// physical registers. static MachineBasicBlock::iterator FindSplitPointForStackProtector(MachineBasicBlock *BB) { MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator(); if (SplitPoint == BB->begin()) return SplitPoint; MachineBasicBlock::iterator Start = BB->begin(); MachineBasicBlock::iterator Previous = SplitPoint; --Previous; while (MIIsInTerminatorSequence(*Previous)) { SplitPoint = Previous; if (Previous == Start) break; --Previous; } return SplitPoint; } void SelectionDAGISel::FinishBasicBlock() { DEBUG(dbgs() << "Total amount of phi nodes to update: " << FuncInfo->PHINodesToUpdate.size() << "\n"; for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) dbgs() << "Node " << i << " : (" << FuncInfo->PHINodesToUpdate[i].first << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n"); // Next, now that we know what the last MBB the LLVM BB expanded is, update // PHI nodes in successors. for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) { MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first); assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); if (!FuncInfo->MBB->isSuccessor(PHI->getParent())) continue; PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB); } // Handle stack protector.
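// Two shapes are produced below (an illustrative summary of the code that
// follows, not upstream commentary): when the target supplies a guard-check
// function, the load-and-check is emitted inline in the parent block; when it
// does not, the parent block is split at the terminator sequence found by
// FindSplitPointForStackProtector, with the spliced-off tail landing in
// SuccessMBB and a separate FailureMBB holding the failure handler.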
if (SDB->SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) { // The target provides a guard check function. There is no need to // generate error handling code or to split current basic block. MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB(); // Add load and check to the basicblock. FuncInfo->MBB = ParentMBB; FuncInfo->InsertPt = FindSplitPointForStackProtector(ParentMBB); SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); // Clear the Per-BB State. SDB->SPDescriptor.resetPerBBState(); } else if (SDB->SPDescriptor.shouldEmitStackProtector()) { MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB(); MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB(); // Find the split point to split the parent mbb. At the same time copy all // physical registers used in the tail of parent mbb into virtual registers // before the split point and back into physical registers after the split // point. This prevents us needing to deal with Live-ins and many other // register allocation issues caused by us splitting the parent mbb. The // register allocator will clean up said virtual copies later on. MachineBasicBlock::iterator SplitPoint = FindSplitPointForStackProtector(ParentMBB); // Splice the terminator of ParentMBB into SuccessMBB. SuccessMBB->splice(SuccessMBB->end(), ParentMBB, SplitPoint, ParentMBB->end()); // Add compare/jump on neq/jump to the parent BB. FuncInfo->MBB = ParentMBB; FuncInfo->InsertPt = ParentMBB->end(); SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); // CodeGen Failure MBB if we have not codegened it yet. MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB(); if (FailureMBB->empty()) { FuncInfo->MBB = FailureMBB; FuncInfo->InsertPt = FailureMBB->end(); SDB->visitSPDescriptorFailure(SDB->SPDescriptor); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); } // Clear the Per-BB State. SDB->SPDescriptor.resetPerBBState(); } // Lower each BitTestBlock. for (auto &BTB : SDB->BitTestCases) { // Lower header first, if it wasn't already lowered if (!BTB.Emitted) { // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = BTB.Parent; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitBitTestHeader(BTB, FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); } BranchProbability UnhandledProb = BTB.Prob; for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) { UnhandledProb -= BTB.Cases[j].ExtraProb; // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = BTB.Cases[j].ThisBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code // If all cases cover a contiguous range, it is not necessary to jump to // the default block after the last bit test fails. This is because the // range check during bit test header creation has guaranteed that every // case here doesn't go outside the range. In this case, there is no need // to perform the last bit test, as it will always be true. Instead, make // the second-to-last bit-test fall through to the target of the last bit // test, and delete the last bit test. MachineBasicBlock *NextMBB; if (BTB.ContiguousRange && j + 2 == ej) { // Second-to-last bit-test with contiguous range: fall through to the // target of the final bit test. 
NextMBB = BTB.Cases[j + 1].TargetBB; } else if (j + 1 == ej) { // For the last bit test, fall through to Default. NextMBB = BTB.Default; } else { // Otherwise, fall through to the next bit test. NextMBB = BTB.Cases[j + 1].ThisBB; } SDB->visitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); if (BTB.ContiguousRange && j + 2 == ej) { // Since we're not going to use the final bit test, remove it. BTB.Cases.pop_back(); break; } } // Update PHI Nodes for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); pi != pe; ++pi) { MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first); MachineBasicBlock *PHIBB = PHI->getParent(); assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); // This is "default" BB. We have two jumps to it. From "header" BB and // from last "case" BB, unless the latter was skipped. if (PHIBB == BTB.Default) { PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(BTB.Parent); if (!BTB.ContiguousRange) { PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second) .addMBB(BTB.Cases.back().ThisBB); } } // One of "cases" BB. for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) { MachineBasicBlock* cBB = BTB.Cases[j].ThisBB; if (cBB->isSuccessor(PHIBB)) PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB); } } } SDB->BitTestCases.clear(); // If the JumpTable record is filled in, then we need to emit a jump table. // Updating the PHI nodes is tricky in this case, since we need to determine // whether the PHI is a successor of the range check MBB or the jump table MBB for (unsigned i = 0, e = SDB->JTCases.size(); i != e; ++i) { // Lower header first, if it wasn't already lowered if (!SDB->JTCases[i].first.Emitted) { // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first, FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); } // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = SDB->JTCases[i].second.MBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Emit the code SDB->visitJumpTable(SDB->JTCases[i].second); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); // Update PHI Nodes for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); pi != pe; ++pi) { MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first); MachineBasicBlock *PHIBB = PHI->getParent(); assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); // "default" BB. We can go there only from header BB. if (PHIBB == SDB->JTCases[i].second.Default) PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second) .addMBB(SDB->JTCases[i].first.HeaderBB); // JT BB. Just iterate over successors here if (FuncInfo->MBB->isSuccessor(PHIBB)) PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(FuncInfo->MBB); } } SDB->JTCases.clear(); // If we generated any switch lowering information, build and codegen any // additional DAGs necessary. for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) { // Set the current basic block to the mbb we wish to insert the code into FuncInfo->MBB = SDB->SwitchCases[i].ThisBB; FuncInfo->InsertPt = FuncInfo->MBB->end(); // Determine the unique successors. 
SmallVector<MachineBasicBlock *, 2> Succs; Succs.push_back(SDB->SwitchCases[i].TrueBB); if (SDB->SwitchCases[i].TrueBB != SDB->SwitchCases[i].FalseBB) Succs.push_back(SDB->SwitchCases[i].FalseBB); // Emit the code. Note that this could result in FuncInfo->MBB being split. SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB); CurDAG->setRoot(SDB->getRoot()); SDB->clear(); CodeGenAndEmitDAG(); // Remember the last block, now that any splitting is done, for use in // populating PHI nodes in successors. MachineBasicBlock *ThisBB = FuncInfo->MBB; // Handle any PHI nodes in successors of this chunk, as if we were coming // from the original BB before switch expansion. Note that PHI nodes can // occur multiple times in PHINodesToUpdate. We have to be very careful to // handle them the right number of times. for (unsigned i = 0, e = Succs.size(); i != e; ++i) { FuncInfo->MBB = Succs[i]; FuncInfo->InsertPt = FuncInfo->MBB->end(); // FuncInfo->MBB may have been removed from the CFG if a branch was // constant folded. if (ThisBB->isSuccessor(FuncInfo->MBB)) { for (MachineBasicBlock::iterator MBBI = FuncInfo->MBB->begin(), MBBE = FuncInfo->MBB->end(); MBBI != MBBE && MBBI->isPHI(); ++MBBI) { MachineInstrBuilder PHI(*MF, MBBI); // This value for this PHI node is recorded in PHINodesToUpdate. for (unsigned pn = 0; ; ++pn) { assert(pn != FuncInfo->PHINodesToUpdate.size() && "Didn't find PHI entry!"); if (FuncInfo->PHINodesToUpdate[pn].first == PHI) { PHI.addReg(FuncInfo->PHINodesToUpdate[pn].second).addMBB(ThisBB); break; } } } } } } SDB->SwitchCases.clear(); } /// Create the scheduler. If a specific scheduler was specified /// via the SchedulerRegistry, use it, otherwise select the /// one preferred by the target. /// ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() { return ISHeuristic(this, OptLevel); } //===----------------------------------------------------------------------===// // Helper functions used by the generated instruction selector. //===----------------------------------------------------------------------===// // Calls to these methods are generated by tblgen. /// CheckAndMask - The isel is trying to match something like (and X, 255). If /// the dag combiner simplified the 255, we still want to match. RHS is the /// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value /// specified in the .td file (e.g. 255). bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); // If the actual mask exactly matches, success! if (ActualMask == DesiredMask) return true; // If the actual AND mask is allowing unallowed bits, this doesn't match. if (ActualMask.intersects(~DesiredMask)) return false; // Otherwise, the DAG Combiner may have proven that the value coming in is // either already zero or is not demanded. Check for known zero input bits. APInt NeededMask = DesiredMask & ~ActualMask; if (CurDAG->MaskedValueIsZero(LHS, NeededMask)) return true; // TODO: check to see if missing bits are just not demanded. // Otherwise, this pattern doesn't match. return false; } /// CheckOrMask - The isel is trying to match something like (or X, 255). If /// the dag combiner simplified the 255, we still want to match. RHS is the /// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value /// specified in the .td file (e.g. 255).
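///
/// Worked example (illustrative, not upstream text): with DesiredMask = 0xFF
/// and an actual constant of 0xF0, NeededMask below is 0x0F, so the match
/// still succeeds provided computeKnownBits shows the low four bits of LHS
/// are already known to be one.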
bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); // If the actual mask exactly matches, success! if (ActualMask == DesiredMask) return true; // If the actual OR mask is allowing unallowed bits, this doesn't match. if (ActualMask.intersects(~DesiredMask)) return false; // Otherwise, the DAG Combiner may have proven that the value coming in is // either already zero or is not demanded. Check for known zero input bits. APInt NeededMask = DesiredMask & ~ActualMask; KnownBits Known; CurDAG->computeKnownBits(LHS, Known); // If all the missing bits in the or are already known to be set, match! if (NeededMask.isSubsetOf(Known.One)) return true; // TODO: check to see if missing bits are just not demanded. // Otherwise, this pattern doesn't match. return false; } /// SelectInlineAsmMemoryOperands - Calls to this are automatically generated /// by tblgen. Others should not call it. void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops, const SDLoc &DL) { std::vector<SDValue> InOps; std::swap(InOps, Ops); Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0 Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1 Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc Ops.push_back(InOps[InlineAsm::Op_ExtraInfo]); // 3 (SideEffect, AlignStack) unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size(); if (InOps[e-1].getValueType() == MVT::Glue) --e; // Don't process a glue operand if it is here. while (i != e) { unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue(); if (!InlineAsm::isMemKind(Flags)) { // Just skip over this operand, copying the operands verbatim. Ops.insert(Ops.end(), InOps.begin()+i, InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1); i += InlineAsm::getNumOperandRegisters(Flags) + 1; } else { assert(InlineAsm::getNumOperandRegisters(Flags) == 1 && "Memory operand with multiple values?"); unsigned TiedToOperand; if (InlineAsm::isUseOperandTiedToDef(Flags, TiedToOperand)) { // We need the constraint ID from the operand this is tied to. unsigned CurOp = InlineAsm::Op_FirstOperand; Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue(); for (; TiedToOperand; --TiedToOperand) { CurOp += InlineAsm::getNumOperandRegisters(Flags)+1; Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue(); } } // Otherwise, this is a memory operand. Ask the target to select it. std::vector<SDValue> SelOps; unsigned ConstraintID = InlineAsm::getMemoryConstraintID(Flags); if (SelectInlineAsmMemoryOperand(InOps[i+1], ConstraintID, SelOps)) report_fatal_error("Could not match memory address. Inline asm" " failure!"); // Add this to the output node. unsigned NewFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size()); NewFlags = InlineAsm::getFlagWordForMem(NewFlags, ConstraintID); Ops.push_back(CurDAG->getTargetConstant(NewFlags, DL, MVT::i32)); Ops.insert(Ops.end(), SelOps.begin(), SelOps.end()); i += 2; } } // Add the glue input back if present. if (e != InOps.size()) Ops.push_back(InOps.back()); } /// findGlueUse - Return use of MVT::Glue value produced by the specified /// SDNode. /// static SDNode *findGlueUse(SDNode *N) { unsigned FlagResNo = N->getNumValues()-1; for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { SDUse &Use = I.getUse(); if (Use.getResNo() == FlagResNo) return Use.getUser(); } return nullptr; } /// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
/// This function iteratively traverses up the operand chain, ignoring /// certain nodes. static bool findNonImmUse(SDNode *Use, SDNode *Def, SDNode *ImmedUse, SDNode *Root, SmallPtrSetImpl<SDNode *> &Visited, bool IgnoreChains) { // Node IDs are unique, and a node's ID is guaranteed to be greater than the // IDs of all of its (recursive) operands. If we scan to a point where 'use' // is smaller than the node we're scanning for, then we know we will never // find it. // // The Use's node ID may be -1 (unassigned) if it is a newly allocated node. // This can happen because we scan down to newly selected nodes in the case // of glue uses. std::vector<SDNode *> WorkList; WorkList.push_back(Use); while (!WorkList.empty()) { Use = WorkList.back(); WorkList.pop_back(); if (Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1) continue; // Don't revisit a node if we already scanned it and didn't fail; we know we // won't fail if we scan it again. if (!Visited.insert(Use).second) continue; for (const SDValue &Op : Use->op_values()) { // Ignore chain uses, they are validated by HandleMergeInputChains. if (Op.getValueType() == MVT::Other && IgnoreChains) continue; SDNode *N = Op.getNode(); if (N == Def) { if (Use == ImmedUse || Use == Root) continue; // We are not looking for immediate use. assert(N != Root); return true; } // Traverse up the operand chain. WorkList.push_back(N); } } return false; } /// IsProfitableToFold - Returns true if it's profitable to fold the specific /// operand node N of U during instruction selection that starts at Root. bool SelectionDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (OptLevel == CodeGenOpt::None) return false; return N.hasOneUse(); } /// IsLegalToFold - Returns true if the specific operand node N of /// U can be folded during instruction selection that starts at Root. bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root, CodeGenOpt::Level OptLevel, bool IgnoreChains) { if (OptLevel == CodeGenOpt::None) return false; // If Root use can somehow reach N through a path that doesn't contain // U then folding N would create a cycle. e.g. In the following // diagram, Root can reach N through X. If N is folded into Root, then // X is both a predecessor and a successor of U. // // [N*] // // ^ ^ // // / \ // // [U*] [X]? // // ^ ^ // // \ / // // \ / // // [Root*] // // // * indicates nodes to be folded together. // // If Root produces glue, then it gets (even more) interesting. Since it // will be "glued" together with its glue use in the scheduler, we need to // check if it might reach N. // // [N*] // // ^ ^ // // / \ // // [U*] [X]? // // ^ ^ // // \ \ // // \ | // // [Root*] | // // ^ | // // f | // // | / // // [Y] / // // ^ / // // f / // // | / // // [GU] // // // If GU (glue use) indirectly reaches N (the load), and Root folds N // (call it Fold), then X is a predecessor of GU and a successor of // Fold. But since Fold and GU are glued together, this will create // a cycle in the scheduling graph. // If the node has glue, walk down the graph to the "lowest" node in the // glued set. EVT VT = Root->getValueType(Root->getNumValues()-1); while (VT == MVT::Glue) { SDNode *GU = findGlueUse(Root); if (!GU) break; Root = GU; VT = Root->getValueType(Root->getNumValues()-1); // If our query node has a glue result with a use, we've walked up it. If // the user (which has already been selected) has a chain or indirectly uses // the chain, our WalkChainUsers predicate will not consider it.
Because of // this, we cannot ignore chains in this predicate. IgnoreChains = false; } SmallPtrSet<SDNode *, 16> Visited; return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains); } void SelectionDAGISel::Select_INLINEASM(SDNode *N) { SDLoc DL(N); std::vector<SDValue> Ops(N->op_begin(), N->op_end()); SelectInlineAsmMemoryOperands(Ops, DL); const EVT VTs[] = {MVT::Other, MVT::Glue}; SDValue New = CurDAG->getNode(ISD::INLINEASM, DL, VTs, Ops); New->setNodeId(-1); ReplaceUses(N, New.getNode()); CurDAG->RemoveDeadNode(N); } void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) { SDLoc dl(Op); MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1)); const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0)); unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0), *CurDAG); SDValue New = CurDAG->getCopyFromReg( Op->getOperand(0), dl, Reg, Op->getValueType(0)); New->setNodeId(-1); ReplaceUses(Op, New.getNode()); CurDAG->RemoveDeadNode(Op); } void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) { SDLoc dl(Op); MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1)); const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0)); unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(), Op->getOperand(2).getValueType(), *CurDAG); SDValue New = CurDAG->getCopyToReg( Op->getOperand(0), dl, Reg, Op->getOperand(2)); New->setNodeId(-1); ReplaceUses(Op, New.getNode()); CurDAG->RemoveDeadNode(Op); } void SelectionDAGISel::Select_UNDEF(SDNode *N) { CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); } /// GetVBR - decode a vbr encoding whose top bit is set. LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { assert(Val >= 128 && "Not a VBR"); Val &= 127; // Remove first vbr bit. unsigned Shift = 7; uint64_t NextBits; do { NextBits = MatcherTable[Idx++]; Val |= (NextBits&127) << Shift; Shift += 7; } while (NextBits & 128); return Val; } /// When a match is complete, this method updates uses of interior chain results /// to use the new results. void SelectionDAGISel::UpdateChains( SDNode *NodeToMatch, SDValue InputChain, SmallVectorImpl<SDNode *> &ChainNodesMatched, bool isMorphNodeTo) { SmallVector<SDNode *, 4> NowDeadNodes; // Now that all the normal results are replaced, we replace the chain and // glue results if present. if (!ChainNodesMatched.empty()) { assert(InputChain.getNode() && "Matched input chains but didn't produce a chain"); // Loop over all of the nodes we matched that produced a chain result. // Replace all the chain results with the final chain we ended up with. for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { SDNode *ChainNode = ChainNodesMatched[i]; // If ChainNode is null, it's because we replaced it on a previous // iteration and we cleared it out of the map. Just skip it. if (!ChainNode) continue; assert(ChainNode->getOpcode() != ISD::DELETED_NODE && "Deleted node left in chain"); // Don't replace the results of the root node if we're doing a // MorphNodeTo.
if (ChainNode == NodeToMatch && isMorphNodeTo) continue; SDValue ChainVal = SDValue(ChainNode, ChainNode->getNumValues()-1); if (ChainVal.getValueType() == MVT::Glue) ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2); assert(ChainVal.getValueType() == MVT::Other && "Not a chain?"); SelectionDAG::DAGNodeDeletedListener NDL( *CurDAG, [&](SDNode *N, SDNode *E) { std::replace(ChainNodesMatched.begin(), ChainNodesMatched.end(), N, static_cast<SDNode *>(nullptr)); }); CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain); // If the node became dead and we haven't already seen it, delete it. if (ChainNode != NodeToMatch && ChainNode->use_empty() && !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode)) NowDeadNodes.push_back(ChainNode); } } if (!NowDeadNodes.empty()) CurDAG->RemoveDeadNodes(NowDeadNodes); DEBUG(dbgs() << "ISEL: Match complete!\n"); } enum ChainResult { CR_Simple, CR_InducesCycle, CR_LeadsToInteriorNode }; /// WalkChainUsers - Walk down the users of the specified chained node that is /// part of the pattern we're matching, looking at all of the users we find. /// This determines whether something is an interior node, whether we have a /// non-pattern node in between two pattern nodes (which prevents folding /// because it would induce a cycle) and whether we have a TokenFactor node /// sandwiched between pattern nodes (in which case the TF becomes part of the /// pattern). /// /// The walk we do here is guaranteed to be small because we quickly get down to /// already selected nodes "below" us. static ChainResult WalkChainUsers(const SDNode *ChainedNode, SmallVectorImpl<SDNode *> &ChainedNodesInPattern, DenseMap<const SDNode *, ChainResult> &TokenFactorResult, SmallVectorImpl<SDNode *> &InteriorChainedNodes) { ChainResult Result = CR_Simple; for (SDNode::use_iterator UI = ChainedNode->use_begin(), E = ChainedNode->use_end(); UI != E; ++UI) { // Make sure the use is of the chain, not some other value we produce. if (UI.getUse().getValueType() != MVT::Other) continue; SDNode *User = *UI; if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph. continue; // If we see an already-selected machine node, then we've gone beyond the // pattern that we're selecting down into the already selected chunk of the // DAG. unsigned UserOpcode = User->getOpcode(); if (User->isMachineOpcode() || UserOpcode == ISD::CopyToReg || UserOpcode == ISD::CopyFromReg || UserOpcode == ISD::INLINEASM || UserOpcode == ISD::EH_LABEL || UserOpcode == ISD::LIFETIME_START || UserOpcode == ISD::LIFETIME_END) { // If their node ID got reset to -1 then they've already been selected. // Treat them like a MachineOpcode. if (User->getNodeId() == -1) continue; } // If we have a TokenFactor, we handle it specially. if (User->getOpcode() != ISD::TokenFactor) { // If the node isn't a token factor and isn't part of our pattern, then it // must be a random chained node in between two nodes we're selecting. // This happens when we have something like: // x = load ptr // call // y = x+4 // store y -> ptr // Because we structurally match the load/store as a read/modify/write, // but the call is chained between them. We cannot fold in this case // because it would induce a cycle in the graph. if (!std::count(ChainedNodesInPattern.begin(), ChainedNodesInPattern.end(), User)) return CR_InducesCycle; // Otherwise we found a node that is part of our pattern. For example in: // x = load ptr // y = x+4 // store y -> ptr // This would happen when we're scanning down from the load and see the // store as a user.
Record that there is a use of ChainedNode that is // part of the pattern and keep scanning uses. Result = CR_LeadsToInteriorNode; InteriorChainedNodes.push_back(User); continue; } // If we found a TokenFactor, there are two cases to consider: first if the // TokenFactor is just hanging "below" the pattern we're matching (i.e. no // uses of the TF are in our pattern) we just want to ignore it. Second, // the TokenFactor can be sandwiched in between two chained nodes, like so: // [Load chain] // ^ // | // [Load] // ^ ^ // | \ DAG's like cheese // / \ do you? // / | // [TokenFactor] [Op] // ^ ^ // | | // \ / // \ / // [Store] // // In this case, the TokenFactor becomes part of our match and we rewrite it // as a new TokenFactor. // // To distinguish these two cases, do a recursive walk down the uses. auto MemoizeResult = TokenFactorResult.find(User); bool Visited = MemoizeResult != TokenFactorResult.end(); // Recursively walk chain users only if the result is not memoized. if (!Visited) { auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult, InteriorChainedNodes); MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first; } switch (MemoizeResult->second) { case CR_Simple: // If the uses of the TokenFactor are just already-selected nodes, ignore // it, it is "below" our pattern. continue; case CR_InducesCycle: // If the uses of the TokenFactor lead to nodes that are not part of our // pattern that are not selected, folding would turn this into a cycle, // bail out now. return CR_InducesCycle; case CR_LeadsToInteriorNode: break; // Otherwise, keep processing. } // Okay, we know we're in the interesting interior case. The TokenFactor // is now going to be considered part of the pattern so that we rewrite its // uses (it may have uses that are not part of the pattern) with the // ultimate chain result of the generated code. We will also add its chain // inputs as inputs to the ultimate TokenFactor we create. Result = CR_LeadsToInteriorNode; if (!Visited) { ChainedNodesInPattern.push_back(User); InteriorChainedNodes.push_back(User); } } return Result; } /// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains /// operation for when the pattern matched at least one node with chains. The /// input vector contains a list of all of the chained nodes that we match. We /// must determine if this is a valid thing to cover (i.e. matching it won't /// induce cycles in the DAG) and if so, creating a TokenFactor node that will /// be used as the input node chain for the generated nodes. static SDValue HandleMergeInputChains(SmallVectorImpl<SDNode *> &ChainNodesMatched, SelectionDAG *CurDAG) { // Used for memoization. Without it WalkChainUsers could take exponential // time to run. DenseMap<const SDNode *, ChainResult> TokenFactorResult; // Walk all of the chained nodes we've matched, recursively scanning down the // users of the chain result. This adds any TokenFactor nodes that are caught // in between chained nodes to the chained and interior nodes list. SmallVector<SDNode *, 3> InteriorChainedNodes; for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) { if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched, TokenFactorResult, InteriorChainedNodes) == CR_InducesCycle) return SDValue(); // Would induce a cycle. } // Okay, we have walked all the matched nodes and collected TokenFactor nodes // that we are interested in. Form our input TokenFactor node.
  SmallVector<SDValue, 3> InputChains;
  for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
    // Add the input chain of this node to the InputChains list (which will be
    // the operands of the generated TokenFactor) if it's not an interior node.
    SDNode *N = ChainNodesMatched[i];
    if (N->getOpcode() != ISD::TokenFactor) {
      if (std::count(InteriorChainedNodes.begin(), InteriorChainedNodes.end(),
                     N))
        continue;

      // Otherwise, add the input chain.
      SDValue InChain = ChainNodesMatched[i]->getOperand(0);
      assert(InChain.getValueType() == MVT::Other && "Not a chain");
      InputChains.push_back(InChain);
      continue;
    }

    // If we have a token factor, we want to add all inputs of the token factor
    // that are not part of the pattern we're matching.
    for (const SDValue &Op : N->op_values()) {
      if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
                      Op.getNode()))
        InputChains.push_back(Op);
    }
  }

  if (InputChains.size() == 1)
    return InputChains[0];
  return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
                         MVT::Other, InputChains);
}

/// MorphNode - Handle morphing a node in place for the selector.
SDNode *SelectionDAGISel::
MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
          ArrayRef<SDValue> Ops, unsigned EmitNodeInfo) {
  // It is possible we're using MorphNodeTo to replace a node with no
  // normal results with one that has a normal result (or we could be
  // adding a chain) and the input could have glue and chains as well.
  // In this case we need to shift the operands down.
  // FIXME: This is a horrible hack and broken in obscure cases, no worse
  // than the old isel though.
  int OldGlueResultNo = -1, OldChainResultNo = -1;

  unsigned NTMNumResults = Node->getNumValues();
  if (Node->getValueType(NTMNumResults-1) == MVT::Glue) {
    OldGlueResultNo = NTMNumResults-1;
    if (NTMNumResults != 1 &&
        Node->getValueType(NTMNumResults-2) == MVT::Other)
      OldChainResultNo = NTMNumResults-2;
  } else if (Node->getValueType(NTMNumResults-1) == MVT::Other)
    OldChainResultNo = NTMNumResults-1;

  // Call the underlying SelectionDAG routine to do the transmogrification. Note
  // that this deletes operands of the old node that become dead.
  SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops);

  // MorphNodeTo can operate in two ways: if an existing node with the
  // specified operands exists, it can just return it.  Otherwise, it
  // updates the node in place to have the requested operands.
  if (Res == Node) {
    // If we updated the node in place, reset the node ID.  To the isel,
    // this should be just like a newly allocated machine node.
    Res->setNodeId(-1);
  }

  unsigned ResNumResults = Res->getNumValues();
  // Move the glue if needed.
  if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
      (unsigned)OldGlueResultNo != ResNumResults-1)
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
                                      SDValue(Res, ResNumResults-1));

  if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
    --ResNumResults;

  // Move the chain reference if needed.
  if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
      (unsigned)OldChainResultNo != ResNumResults-1)
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
                                      SDValue(Res, ResNumResults-1));

  // Otherwise, no replacement happened because the node already exists. Replace
  // uses of the old node with the new one.
  if (Res != Node) {
    CurDAG->ReplaceAllUsesWith(Node, Res);
    CurDAG->RemoveDeadNode(Node);
  }

  return Res;
}

/// CheckSame - Implements OP_CheckSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
          SDValue N,
          const SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes) {
  // Accept if it is exactly the same as a previously recorded node.
  unsigned RecNo = MatcherTable[MatcherIndex++];
  assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
  return N == RecordedNodes[RecNo].first;
}

/// CheckChildSame - Implements OP_CheckChildXSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
               SDValue N,
               const SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes,
               unsigned ChildNo) {
  if (ChildNo >= N.getNumOperands())
    return false;  // Match fails if out of range child #.
  return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
                     RecordedNodes);
}

/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
                      const SelectionDAGISel &SDISel) {
  return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
}

/// CheckNodePredicate - Implements OP_CheckNodePredicate.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
                   const SelectionDAGISel &SDISel, SDNode *N) {
  return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
            SDNode *N) {
  uint16_t Opc = MatcherTable[MatcherIndex++];
  Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
  return N->getOpcode() == Opc;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
          const TargetLowering *TLI, const DataLayout &DL) {
  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
  if (N.getValueType() == VT)
    return true;

  // Handle the case when VT is iPTR.
  return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
               SDValue N, const TargetLowering *TLI, const DataLayout &DL,
               unsigned ChildNo) {
  if (ChildNo >= N.getNumOperands())
    return false;  // Match fails if out of range child #.
  return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI,
                     DL);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
              SDValue N) {
  return cast<CondCodeSDNode>(N)->get() ==
      (ISD::CondCode)MatcherTable[MatcherIndex++];
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
               SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
  MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
  if (cast<VTSDNode>(N)->getVT() == VT)
    return true;

  // Handle the case when VT is iPTR.
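  // For illustration: iPTR is a placeholder in the matcher table for "the
  // target's pointer type".  On a target whose getPointerTy(DL) is i64, an
  // iPTR entry therefore accepts an i64 value here, exactly as in CheckType
  // above.  (Worked example; the concrete type depends on the DataLayout.)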
  return VT == MVT::iPTR &&
         cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
             SDValue N) {
  int64_t Val = MatcherTable[MatcherIndex++];
  if (Val & 128)
    Val = GetVBR(Val, MatcherTable, MatcherIndex);

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
  return C && C->getSExtValue() == Val;
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
                  SDValue N, unsigned ChildNo) {
  if (ChildNo >= N.getNumOperands())
    return false;  // Match fails if out of range child #.
  return ::CheckInteger(MatcherTable, MatcherIndex, N.getOperand(ChildNo));
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
            SDValue N, const SelectionDAGISel &SDISel) {
  int64_t Val = MatcherTable[MatcherIndex++];
  if (Val & 128)
    Val = GetVBR(Val, MatcherTable, MatcherIndex);

  if (N->getOpcode() != ISD::AND) return false;

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  return C && SDISel.CheckAndMask(N.getOperand(0), C, Val);
}

LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
           const SelectionDAGISel &SDISel) {
  int64_t Val = MatcherTable[MatcherIndex++];
  if (Val & 128)
    Val = GetVBR(Val, MatcherTable, MatcherIndex);

  if (N->getOpcode() != ISD::OR) return false;

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  return C && SDISel.CheckOrMask(N.getOperand(0), C, Val);
}

/// IsPredicateKnownToFail - If we know how and can do so without pushing a
/// scope, evaluate the current node.  If the current predicate is known to
/// fail, set Result=true and return anything.  If the current predicate is
/// known to pass, set Result=false and return the MatcherIndex to continue
/// with.  If the current predicate is unknown, set Result=false and return the
/// MatcherIndex to continue with.
static unsigned IsPredicateKnownToFail(const unsigned char *Table,
                                       unsigned Index, SDValue N,
                                       bool &Result,
                                       const SelectionDAGISel &SDISel,
                 SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes) {
  switch (Table[Index++]) {
  default:
    Result = false;
    return Index-1;  // Could not evaluate this predicate.
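  // For illustration of this fast path (a sketch; the opcode value is made
  // up): given the table bytes { OPC_CheckOpcode, 0x1C, 0x00 }, opcodes are
  // stored little-endian in two bytes, so a node whose opcode is not 0x001C
  // sets Result=true and the caller skips the scope entry without ever
  // pushing a MatchScope.  Any table opcode not handled below falls into the
  // 'default' case above and must be evaluated the slow way.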
  case SelectionDAGISel::OPC_CheckSame:
    Result = !::CheckSame(Table, Index, N, RecordedNodes);
    return Index;
  case SelectionDAGISel::OPC_CheckChild0Same:
  case SelectionDAGISel::OPC_CheckChild1Same:
  case SelectionDAGISel::OPC_CheckChild2Same:
  case SelectionDAGISel::OPC_CheckChild3Same:
    Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
    return Index;
  case SelectionDAGISel::OPC_CheckPatternPredicate:
    Result = !::CheckPatternPredicate(Table, Index, SDISel);
    return Index;
  case SelectionDAGISel::OPC_CheckPredicate:
    Result = !::CheckNodePredicate(Table, Index, SDISel, N.getNode());
    return Index;
  case SelectionDAGISel::OPC_CheckOpcode:
    Result = !::CheckOpcode(Table, Index, N.getNode());
    return Index;
  case SelectionDAGISel::OPC_CheckType:
    Result = !::CheckType(Table, Index, N, SDISel.TLI,
                          SDISel.CurDAG->getDataLayout());
    return Index;
  case SelectionDAGISel::OPC_CheckTypeRes: {
    unsigned Res = Table[Index++];
    Result = !::CheckType(Table, Index, N.getValue(Res), SDISel.TLI,
                          SDISel.CurDAG->getDataLayout());
    return Index;
  }
  case SelectionDAGISel::OPC_CheckChild0Type:
  case SelectionDAGISel::OPC_CheckChild1Type:
  case SelectionDAGISel::OPC_CheckChild2Type:
  case SelectionDAGISel::OPC_CheckChild3Type:
  case SelectionDAGISel::OPC_CheckChild4Type:
  case SelectionDAGISel::OPC_CheckChild5Type:
  case SelectionDAGISel::OPC_CheckChild6Type:
  case SelectionDAGISel::OPC_CheckChild7Type:
    Result = !::CheckChildType(
                 Table, Index, N, SDISel.TLI, SDISel.CurDAG->getDataLayout(),
                 Table[Index - 1] - SelectionDAGISel::OPC_CheckChild0Type);
    return Index;
  case SelectionDAGISel::OPC_CheckCondCode:
    Result = !::CheckCondCode(Table, Index, N);
    return Index;
  case SelectionDAGISel::OPC_CheckValueType:
    Result = !::CheckValueType(Table, Index, N, SDISel.TLI,
                               SDISel.CurDAG->getDataLayout());
    return Index;
  case SelectionDAGISel::OPC_CheckInteger:
    Result = !::CheckInteger(Table, Index, N);
    return Index;
  case SelectionDAGISel::OPC_CheckChild0Integer:
  case SelectionDAGISel::OPC_CheckChild1Integer:
  case SelectionDAGISel::OPC_CheckChild2Integer:
  case SelectionDAGISel::OPC_CheckChild3Integer:
  case SelectionDAGISel::OPC_CheckChild4Integer:
    Result = !::CheckChildInteger(Table, Index, N,
                     Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Integer);
    return Index;
  case SelectionDAGISel::OPC_CheckAndImm:
    Result = !::CheckAndImm(Table, Index, N, SDISel);
    return Index;
  case SelectionDAGISel::OPC_CheckOrImm:
    Result = !::CheckOrImm(Table, Index, N, SDISel);
    return Index;
  }
}

namespace {

struct MatchScope {
  /// FailIndex - If this match fails, this is the index to continue with.
  unsigned FailIndex;

  /// NodeStack - The node stack when the scope was formed.
  SmallVector<SDValue, 4> NodeStack;

  /// NumRecordedNodes - The number of recorded nodes when the scope was
  /// formed.
  unsigned NumRecordedNodes;

  /// NumMatchedMemRefs - The number of matched memref entries.
  unsigned NumMatchedMemRefs;

  /// InputChain/InputGlue - The current chain/glue
  SDValue InputChain, InputGlue;

  /// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
  bool HasChainNodesMatched;
};

/// \brief A DAG update listener to keep the matching state
/// (i.e. RecordedNodes and MatchScope) up to date if the target is allowed to
/// change the DAG while matching.  X86 addressing mode matcher is an example
/// for this.
class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
{
  SDNode **NodeToMatch;
  SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
  SmallVectorImpl<MatchScope> &MatchScopes;

public:
  MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
                    SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
                    SmallVectorImpl<MatchScope> &MS)
      : SelectionDAG::DAGUpdateListener(DAG), NodeToMatch(NodeToMatch),
        RecordedNodes(RN), MatchScopes(MS) {}

  void NodeDeleted(SDNode *N, SDNode *E) override {
    // Some early-returns here to avoid the search if we deleted the node or
    // if the update comes from MorphNodeTo (MorphNodeTo is the last thing we
    // do, so it's unnecessary to update matching state at that point).
    // Neither of these can occur currently because we only install this
    // update listener while matching a complex pattern.
    if (!E || E->isMachineOpcode())
      return;
    // Check if NodeToMatch was updated.
    if (N == *NodeToMatch)
      *NodeToMatch = E;
    // Performing linear search here does not matter because we almost never
    // run this code.  You'd have to have a CSE during complex pattern
    // matching.
    for (auto &I : RecordedNodes)
      if (I.first.getNode() == N)
        I.first.setNode(E);
    for (auto &I : MatchScopes)
      for (auto &J : I.NodeStack)
        if (J.getNode() == N)
          J.setNode(E);
  }
};

} // end anonymous namespace

void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
                                        const unsigned char *MatcherTable,
                                        unsigned TableSize) {
  // FIXME: Should these even be selected?  Handle these cases in the caller?
  switch (NodeToMatch->getOpcode()) {
  default:
    break;
  case ISD::EntryToken:       // These nodes remain the same.
  case ISD::BasicBlock:
  case ISD::Register:
  case ISD::RegisterMask:
  case ISD::HANDLENODE:
  case ISD::MDNODE_SDNODE:
  case ISD::TargetConstant:
  case ISD::TargetConstantFP:
  case ISD::TargetConstantPool:
  case ISD::TargetFrameIndex:
  case ISD::TargetExternalSymbol:
  case ISD::MCSymbol:
  case ISD::TargetBlockAddress:
  case ISD::TargetJumpTable:
  case ISD::TargetGlobalTLSAddress:
  case ISD::TargetGlobalAddress:
  case ISD::TokenFactor:
  case ISD::CopyFromReg:
  case ISD::CopyToReg:
  case ISD::EH_LABEL:
  case ISD::ANNOTATION_LABEL:
  case ISD::LIFETIME_START:
  case ISD::LIFETIME_END:
    NodeToMatch->setNodeId(-1); // Mark selected.
    return;
  case ISD::AssertSext:
  case ISD::AssertZext:
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
                                      NodeToMatch->getOperand(0));
    CurDAG->RemoveDeadNode(NodeToMatch);
    return;
  case ISD::INLINEASM: Select_INLINEASM(NodeToMatch); return;
  case ISD::READ_REGISTER: Select_READ_REGISTER(NodeToMatch); return;
  case ISD::WRITE_REGISTER: Select_WRITE_REGISTER(NodeToMatch); return;
  case ISD::UNDEF: Select_UNDEF(NodeToMatch); return;
  }

  assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");

  // Set up the node stack with NodeToMatch as the only node on the stack.
  SmallVector<SDValue, 8> NodeStack;
  SDValue N = SDValue(NodeToMatch, 0);
  NodeStack.push_back(N);

  // MatchScopes - Scopes used when matching, if a match failure happens, this
  // indicates where to continue checking.
  SmallVector<MatchScope, 8> MatchScopes;

  // RecordedNodes - This is the set of nodes that have been recorded by the
  // state machine.  The second value is the parent of the node, or null if the
  // root is recorded.
  SmallVector<std::pair<SDValue, SDNode *>, 8> RecordedNodes;

  // MatchedMemRefs - This is the set of MemRef's we've seen in the input
  // pattern.
  SmallVector<MachineMemOperand *, 2> MatchedMemRefs;

  // These are the current input chain and glue for use when generating nodes.
  // Various Emit operations change these.  For example, emitting a copytoreg
  // uses and updates these.
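  // A note on the matcher-table encoding used throughout the interpreter
  // below (an illustrative sketch, not from the original source): integers
  // are stored as a variable-width byte sequence in which each byte carries 7
  // payload bits and the high bit marks a continuation, with the first byte
  // holding the least significant bits.  A stand-alone decoder equivalent to
  // the GetVBR helper used below would be:
  //
  //   static uint64_t DecodeVBR(const unsigned char *Table, unsigned &Idx) {
  //     uint64_t Val = Table[Idx++];
  //     if ((Val & 128) == 0)
  //       return Val;                 // Small values fit in one byte.
  //     Val &= 127;                   // Strip the continuation bit.
  //     unsigned Shift = 7;
  //     uint64_t NextBits;
  //     do {
  //       NextBits = Table[Idx++];
  //       Val |= (NextBits & 127) << Shift;
  //       Shift += 7;
  //     } while (NextBits & 128);
  //     return Val;
  //   }
  //
  // e.g. the bytes { 0x83, 0x02 } decode to (0x83 & 127) | (2 << 7) = 259.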
SDValue InputChain, InputGlue; // ChainNodesMatched - If a pattern matches nodes that have input/output // chains, the OPC_EmitMergeInputChains operation is emitted which indicates // which ones they are. The result is captured into this list so that we can // update the chain results when the pattern is complete. SmallVector ChainNodesMatched; DEBUG(dbgs() << "ISEL: Starting pattern match on root node: "; NodeToMatch->dump(CurDAG); dbgs() << '\n'); // Determine where to start the interpreter. Normally we start at opcode #0, // but if the state machine starts with an OPC_SwitchOpcode, then we // accelerate the first lookup (which is guaranteed to be hot) with the // OpcodeOffset table. unsigned MatcherIndex = 0; if (!OpcodeOffset.empty()) { // Already computed the OpcodeOffset table, just index into it. if (N.getOpcode() < OpcodeOffset.size()) MatcherIndex = OpcodeOffset[N.getOpcode()]; DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n"); } else if (MatcherTable[0] == OPC_SwitchOpcode) { // Otherwise, the table isn't computed, but the state machine does start // with an OPC_SwitchOpcode instruction. Populate the table now, since this // is the first time we're selecting an instruction. unsigned Idx = 1; while (true) { // Get the size of this case. unsigned CaseSize = MatcherTable[Idx++]; if (CaseSize & 128) CaseSize = GetVBR(CaseSize, MatcherTable, Idx); if (CaseSize == 0) break; // Get the opcode, add the index to the table. uint16_t Opc = MatcherTable[Idx++]; Opc |= (unsigned short)MatcherTable[Idx++] << 8; if (Opc >= OpcodeOffset.size()) OpcodeOffset.resize((Opc+1)*2); OpcodeOffset[Opc] = Idx; Idx += CaseSize; } // Okay, do the lookup for the first opcode. if (N.getOpcode() < OpcodeOffset.size()) MatcherIndex = OpcodeOffset[N.getOpcode()]; } while (true) { assert(MatcherIndex < TableSize && "Invalid index"); #ifndef NDEBUG unsigned CurrentOpcodeIndex = MatcherIndex; #endif BuiltinOpcodes Opcode = (BuiltinOpcodes)MatcherTable[MatcherIndex++]; switch (Opcode) { case OPC_Scope: { // Okay, the semantics of this operation are that we should push a scope // then evaluate the first child. However, pushing a scope only to have // the first check fail (which then pops it) is inefficient. If we can // determine immediately that the first check (or first several) will // immediately fail, don't even bother pushing a scope for them. unsigned FailIndex; while (true) { unsigned NumToSkip = MatcherTable[MatcherIndex++]; if (NumToSkip & 128) NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); // Found the end of the scope with no match. if (NumToSkip == 0) { FailIndex = 0; break; } FailIndex = MatcherIndex+NumToSkip; unsigned MatcherIndexOfPredicate = MatcherIndex; (void)MatcherIndexOfPredicate; // silence warning. // If we can't evaluate this predicate without pushing a scope (e.g. if // it is a 'MoveParent') or if the predicate succeeds on this node, we // push the scope and evaluate the full predicate chain. bool Result; MatcherIndex = IsPredicateKnownToFail(MatcherTable, MatcherIndex, N, Result, *this, RecordedNodes); if (!Result) break; DEBUG(dbgs() << " Skipped scope entry (due to false predicate) at " << "index " << MatcherIndexOfPredicate << ", continuing at " << FailIndex << "\n"); ++NumDAGIselRetries; // Otherwise, we know that this case of the Scope is guaranteed to fail, // move to the next case. MatcherIndex = FailIndex; } // If the whole scope failed to match, bail. 
if (FailIndex == 0) break; // Push a MatchScope which indicates where to go if the first child fails // to match. MatchScope NewEntry; NewEntry.FailIndex = FailIndex; NewEntry.NodeStack.append(NodeStack.begin(), NodeStack.end()); NewEntry.NumRecordedNodes = RecordedNodes.size(); NewEntry.NumMatchedMemRefs = MatchedMemRefs.size(); NewEntry.InputChain = InputChain; NewEntry.InputGlue = InputGlue; NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty(); MatchScopes.push_back(NewEntry); continue; } case OPC_RecordNode: { // Remember this node, it may end up being an operand in the pattern. SDNode *Parent = nullptr; if (NodeStack.size() > 1) Parent = NodeStack[NodeStack.size()-2].getNode(); RecordedNodes.push_back(std::make_pair(N, Parent)); continue; } case OPC_RecordChild0: case OPC_RecordChild1: case OPC_RecordChild2: case OPC_RecordChild3: case OPC_RecordChild4: case OPC_RecordChild5: case OPC_RecordChild6: case OPC_RecordChild7: { unsigned ChildNo = Opcode-OPC_RecordChild0; if (ChildNo >= N.getNumOperands()) break; // Match fails if out of range child #. RecordedNodes.push_back(std::make_pair(N->getOperand(ChildNo), N.getNode())); continue; } case OPC_RecordMemRef: if (auto *MN = dyn_cast(N)) MatchedMemRefs.push_back(MN->getMemOperand()); else { DEBUG( dbgs() << "Expected MemSDNode "; N->dump(CurDAG); dbgs() << '\n' ); } continue; case OPC_CaptureGlueInput: // If the current node has an input glue, capture it in InputGlue. if (N->getNumOperands() != 0 && N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) InputGlue = N->getOperand(N->getNumOperands()-1); continue; case OPC_MoveChild: { unsigned ChildNo = MatcherTable[MatcherIndex++]; if (ChildNo >= N.getNumOperands()) break; // Match fails if out of range child #. N = N.getOperand(ChildNo); NodeStack.push_back(N); continue; } case OPC_MoveChild0: case OPC_MoveChild1: case OPC_MoveChild2: case OPC_MoveChild3: case OPC_MoveChild4: case OPC_MoveChild5: case OPC_MoveChild6: case OPC_MoveChild7: { unsigned ChildNo = Opcode-OPC_MoveChild0; if (ChildNo >= N.getNumOperands()) break; // Match fails if out of range child #. N = N.getOperand(ChildNo); NodeStack.push_back(N); continue; } case OPC_MoveParent: // Pop the current node off the NodeStack. NodeStack.pop_back(); assert(!NodeStack.empty() && "Node stack imbalance!"); N = NodeStack.back(); continue; case OPC_CheckSame: if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break; continue; case OPC_CheckChild0Same: case OPC_CheckChild1Same: case OPC_CheckChild2Same: case OPC_CheckChild3Same: if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes, Opcode-OPC_CheckChild0Same)) break; continue; case OPC_CheckPatternPredicate: if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break; continue; case OPC_CheckPredicate: if (!::CheckNodePredicate(MatcherTable, MatcherIndex, *this, N.getNode())) break; continue; case OPC_CheckComplexPat: { unsigned CPNum = MatcherTable[MatcherIndex++]; unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat"); // If target can modify DAG during matching, keep the matching state // consistent. 
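// For illustration of when this matters (a sketch of the scenario): a complex
// pattern hook such as X86's address-mode matcher may call back into the DAG
// and create new nodes, and CSE can then replace a node that is already
// referenced from RecordedNodes or from a saved MatchScope node stack with an
// equivalent existing node.  The listener installed below rewrites those
// stale SDValue entries to the replacement node so that later table
// operations keep working; complex-pattern matching is the only point where
// it is installed.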
std::unique_ptr MSU; if (ComplexPatternFuncMutatesDAG()) MSU.reset(new MatchStateUpdater(*CurDAG, &NodeToMatch, RecordedNodes, MatchScopes)); if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second, RecordedNodes[RecNo].first, CPNum, RecordedNodes)) break; continue; } case OPC_CheckOpcode: if (!::CheckOpcode(MatcherTable, MatcherIndex, N.getNode())) break; continue; case OPC_CheckType: if (!::CheckType(MatcherTable, MatcherIndex, N, TLI, CurDAG->getDataLayout())) break; continue; case OPC_CheckTypeRes: { unsigned Res = MatcherTable[MatcherIndex++]; if (!::CheckType(MatcherTable, MatcherIndex, N.getValue(Res), TLI, CurDAG->getDataLayout())) break; continue; } case OPC_SwitchOpcode: { unsigned CurNodeOpcode = N.getOpcode(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex); if (CaseSize == 0) break; uint16_t Opc = MatcherTable[MatcherIndex++]; Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8; // If the opcode matches, then we will execute this case. if (CurNodeOpcode == Opc) break; // Otherwise, skip over this case. MatcherIndex += CaseSize; } // If no cases matched, bail out. if (CaseSize == 0) break; // Otherwise, execute the case we found. DEBUG(dbgs() << " OpcodeSwitch from " << SwitchStart << " to " << MatcherIndex << "\n"); continue; } case OPC_SwitchType: { MVT CurNodeVT = N.getSimpleValueType(); unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart; unsigned CaseSize; while (true) { // Get the size of this case. CaseSize = MatcherTable[MatcherIndex++]; if (CaseSize & 128) CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex); if (CaseSize == 0) break; MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; if (CaseVT == MVT::iPTR) CaseVT = TLI->getPointerTy(CurDAG->getDataLayout()); // If the VT matches, then we will execute this case. if (CurNodeVT == CaseVT) break; // Otherwise, skip over this case. MatcherIndex += CaseSize; } // If no cases matched, bail out. if (CaseSize == 0) break; // Otherwise, execute the case we found. 
DEBUG(dbgs() << " TypeSwitch[" << EVT(CurNodeVT).getEVTString() << "] from " << SwitchStart << " to " << MatcherIndex<<'\n'); continue; } case OPC_CheckChild0Type: case OPC_CheckChild1Type: case OPC_CheckChild2Type: case OPC_CheckChild3Type: case OPC_CheckChild4Type: case OPC_CheckChild5Type: case OPC_CheckChild6Type: case OPC_CheckChild7Type: if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI, CurDAG->getDataLayout(), Opcode - OPC_CheckChild0Type)) break; continue; case OPC_CheckCondCode: if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break; continue; case OPC_CheckValueType: if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI, CurDAG->getDataLayout())) break; continue; case OPC_CheckInteger: if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break; continue; case OPC_CheckChild0Integer: case OPC_CheckChild1Integer: case OPC_CheckChild2Integer: case OPC_CheckChild3Integer: case OPC_CheckChild4Integer: if (!::CheckChildInteger(MatcherTable, MatcherIndex, N, Opcode-OPC_CheckChild0Integer)) break; continue; case OPC_CheckAndImm: if (!::CheckAndImm(MatcherTable, MatcherIndex, N, *this)) break; continue; case OPC_CheckOrImm: if (!::CheckOrImm(MatcherTable, MatcherIndex, N, *this)) break; continue; case OPC_CheckFoldableChainNode: { assert(NodeStack.size() != 1 && "No parent node"); // Verify that all intermediate nodes between the root and this one have // a single use. bool HasMultipleUses = false; for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i) if (!NodeStack[i].getNode()->hasOneUse()) { HasMultipleUses = true; break; } if (HasMultipleUses) break; // Check to see that the target thinks this is profitable to fold and that // we can fold it without inducing cycles in the graph. if (!IsProfitableToFold(N, NodeStack[NodeStack.size()-2].getNode(), NodeToMatch) || !IsLegalToFold(N, NodeStack[NodeStack.size()-2].getNode(), NodeToMatch, OptLevel, true/*We validate our own chains*/)) break; continue; } case OPC_EmitInteger: { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; int64_t Val = MatcherTable[MatcherIndex++]; if (Val & 128) Val = GetVBR(Val, MatcherTable, MatcherIndex); RecordedNodes.push_back(std::pair( CurDAG->getTargetConstant(Val, SDLoc(NodeToMatch), VT), nullptr)); continue; } case OPC_EmitRegister: { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; unsigned RegNo = MatcherTable[MatcherIndex++]; RecordedNodes.push_back(std::pair( CurDAG->getRegister(RegNo, VT), nullptr)); continue; } case OPC_EmitRegister2: { // For targets w/ more than 256 register names, the register enum // values are stored in two bytes in the matcher table (just like // opcodes). MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; unsigned RegNo = MatcherTable[MatcherIndex++]; RegNo |= MatcherTable[MatcherIndex++] << 8; RecordedNodes.push_back(std::pair( CurDAG->getRegister(RegNo, VT), nullptr)); continue; } case OPC_EmitConvertToTarget: { // Convert from IMM/FPIMM to target version. 
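    // For illustration (a sketch, not target-specific behavior): matching
    // (add x, (imm 7)) records an ISD::Constant node for 7, but a plain
    // Constant is itself selectable and would be materialized as a separate
    // instruction.  Rewriting it as a TargetConstant marks it as a finished
    // operand that instruction emission consumes in place.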
unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget"); SDValue Imm = RecordedNodes[RecNo].first; if (Imm->getOpcode() == ISD::Constant) { const ConstantInt *Val=cast(Imm)->getConstantIntValue(); Imm = CurDAG->getTargetConstant(*Val, SDLoc(NodeToMatch), Imm.getValueType()); } else if (Imm->getOpcode() == ISD::ConstantFP) { const ConstantFP *Val=cast(Imm)->getConstantFPValue(); Imm = CurDAG->getTargetConstantFP(*Val, SDLoc(NodeToMatch), Imm.getValueType()); } RecordedNodes.push_back(std::make_pair(Imm, RecordedNodes[RecNo].second)); continue; } case OPC_EmitMergeInputChains1_0: // OPC_EmitMergeInputChains, 1, 0 case OPC_EmitMergeInputChains1_1: // OPC_EmitMergeInputChains, 1, 1 case OPC_EmitMergeInputChains1_2: { // OPC_EmitMergeInputChains, 1, 2 // These are space-optimized forms of OPC_EmitMergeInputChains. assert(!InputChain.getNode() && "EmitMergeInputChains should be the first chain producing node"); assert(ChainNodesMatched.empty() && "Should only have one EmitMergeInputChains per match"); // Read all of the chained nodes. unsigned RecNo = Opcode - OPC_EmitMergeInputChains1_0; assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains"); ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode()); // FIXME: What if other value results of the node have uses not matched // by this pattern? if (ChainNodesMatched.back() != NodeToMatch && !RecordedNodes[RecNo].first.hasOneUse()) { ChainNodesMatched.clear(); break; } // Merge the input chains if they are not intra-pattern references. InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG); if (!InputChain.getNode()) break; // Failed to merge. continue; } case OPC_EmitMergeInputChains: { assert(!InputChain.getNode() && "EmitMergeInputChains should be the first chain producing node"); // This node gets a list of nodes we matched in the input that have // chains. We want to token factor all of the input chains to these nodes // together. However, if any of the input chains is actually one of the // nodes matched in this pattern, then we have an intra-match reference. // Ignore these because the newly token factored chain should not refer to // the old nodes. unsigned NumChains = MatcherTable[MatcherIndex++]; assert(NumChains != 0 && "Can't TF zero chains"); assert(ChainNodesMatched.empty() && "Should only have one EmitMergeInputChains per match"); // Read all of the chained nodes. for (unsigned i = 0; i != NumChains; ++i) { unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains"); ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode()); // FIXME: What if other value results of the node have uses not matched // by this pattern? if (ChainNodesMatched.back() != NodeToMatch && !RecordedNodes[RecNo].first.hasOneUse()) { ChainNodesMatched.clear(); break; } } // If the inner loop broke out, the match fails. if (ChainNodesMatched.empty()) break; // Merge the input chains if they are not intra-pattern references. InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG); if (!InputChain.getNode()) break; // Failed to merge. 
continue; } case OPC_EmitCopyToReg: { unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg"); unsigned DestPhysReg = MatcherTable[MatcherIndex++]; if (!InputChain.getNode()) InputChain = CurDAG->getEntryNode(); InputChain = CurDAG->getCopyToReg(InputChain, SDLoc(NodeToMatch), DestPhysReg, RecordedNodes[RecNo].first, InputGlue); InputGlue = InputChain.getValue(1); continue; } case OPC_EmitNodeXForm: { unsigned XFormNo = MatcherTable[MatcherIndex++]; unsigned RecNo = MatcherTable[MatcherIndex++]; assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm"); SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo); RecordedNodes.push_back(std::pair(Res, nullptr)); continue; } case OPC_Coverage: { // This is emitted right before MorphNode/EmitNode. // So it should be safe to assume that this node has been selected unsigned index = MatcherTable[MatcherIndex++]; index |= (MatcherTable[MatcherIndex++] << 8); dbgs() << "COVERED: " << getPatternForIndex(index) << "\n"; dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n"; continue; } case OPC_EmitNode: case OPC_MorphNodeTo: case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2: case OPC_MorphNodeTo0: case OPC_MorphNodeTo1: case OPC_MorphNodeTo2: { uint16_t TargetOpc = MatcherTable[MatcherIndex++]; TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8; unsigned EmitNodeInfo = MatcherTable[MatcherIndex++]; // Get the result VT list. unsigned NumVTs; // If this is one of the compressed forms, get the number of VTs based // on the Opcode. Otherwise read the next byte from the table. if (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2) NumVTs = Opcode - OPC_MorphNodeTo0; else if (Opcode >= OPC_EmitNode0 && Opcode <= OPC_EmitNode2) NumVTs = Opcode - OPC_EmitNode0; else NumVTs = MatcherTable[MatcherIndex++]; SmallVector VTs; for (unsigned i = 0; i != NumVTs; ++i) { MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++]; if (VT == MVT::iPTR) VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy; VTs.push_back(VT); } if (EmitNodeInfo & OPFL_Chain) VTs.push_back(MVT::Other); if (EmitNodeInfo & OPFL_GlueOutput) VTs.push_back(MVT::Glue); // This is hot code, so optimize the two most common cases of 1 and 2 // results. SDVTList VTList; if (VTs.size() == 1) VTList = CurDAG->getVTList(VTs[0]); else if (VTs.size() == 2) VTList = CurDAG->getVTList(VTs[0], VTs[1]); else VTList = CurDAG->getVTList(VTs); // Get the operand list. unsigned NumOps = MatcherTable[MatcherIndex++]; SmallVector Ops; for (unsigned i = 0; i != NumOps; ++i) { unsigned RecNo = MatcherTable[MatcherIndex++]; if (RecNo & 128) RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex); assert(RecNo < RecordedNodes.size() && "Invalid EmitNode"); Ops.push_back(RecordedNodes[RecNo].first); } // If there are variadic operands to add, handle them now. if (EmitNodeInfo & OPFL_VariadicInfo) { // Determine the start index to copy from. unsigned FirstOpToCopy = getNumFixedFromVariadicInfo(EmitNodeInfo); FirstOpToCopy += (EmitNodeInfo & OPFL_Chain) ? 1 : 0; assert(NodeToMatch->getNumOperands() >= FirstOpToCopy && "Invalid variadic node"); // Copy all of the variadic operands, not including a potential glue // input. for (unsigned i = FirstOpToCopy, e = NodeToMatch->getNumOperands(); i != e; ++i) { SDValue V = NodeToMatch->getOperand(i); if (V.getValueType() == MVT::Glue) break; Ops.push_back(V); } } // If this has chain/glue inputs, add them. 
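      // For illustration (a sketch only; the exact layout is
      // pattern-specific): for a matched load pattern, at this point VTs
      // might be { i32, MVT::Other } (the loaded value plus the chain type
      // appended above), and Ops might become { Base, Offset, InputChain }
      // once the chain operand is pushed below.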
if (EmitNodeInfo & OPFL_Chain) Ops.push_back(InputChain); if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != nullptr) Ops.push_back(InputGlue); // Create the node. MachineSDNode *Res = nullptr; bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo || (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2); if (!IsMorphNodeTo) { // If this is a normal EmitNode command, just create the new node and // add the results to the RecordedNodes list. Res = CurDAG->getMachineNode(TargetOpc, SDLoc(NodeToMatch), VTList, Ops); // Add all the non-glue/non-chain results to the RecordedNodes list. for (unsigned i = 0, e = VTs.size(); i != e; ++i) { if (VTs[i] == MVT::Other || VTs[i] == MVT::Glue) break; RecordedNodes.push_back(std::pair(SDValue(Res, i), nullptr)); } } else { assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE && "NodeToMatch was removed partway through selection"); SelectionDAG::DAGNodeDeletedListener NDL(*CurDAG, [&](SDNode *N, SDNode *E) { CurDAG->salvageDebugInfo(*N); auto &Chain = ChainNodesMatched; assert((!E || !is_contained(Chain, N)) && "Chain node replaced during MorphNode"); Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end()); }); Res = cast(MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo)); } // If the node had chain/glue results, update our notion of the current // chain and glue. if (EmitNodeInfo & OPFL_GlueOutput) { InputGlue = SDValue(Res, VTs.size()-1); if (EmitNodeInfo & OPFL_Chain) InputChain = SDValue(Res, VTs.size()-2); } else if (EmitNodeInfo & OPFL_Chain) InputChain = SDValue(Res, VTs.size()-1); // If the OPFL_MemRefs glue is set on this node, slap all of the // accumulated memrefs onto it. // // FIXME: This is vastly incorrect for patterns with multiple outputs // instructions that access memory and for ComplexPatterns that match // loads. if (EmitNodeInfo & OPFL_MemRefs) { // Only attach load or store memory operands if the generated // instruction may load or store. const MCInstrDesc &MCID = TII->get(TargetOpc); bool mayLoad = MCID.mayLoad(); bool mayStore = MCID.mayStore(); unsigned NumMemRefs = 0; for (SmallVectorImpl::const_iterator I = MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) { if ((*I)->isLoad()) { if (mayLoad) ++NumMemRefs; } else if ((*I)->isStore()) { if (mayStore) ++NumMemRefs; } else { ++NumMemRefs; } } MachineSDNode::mmo_iterator MemRefs = MF->allocateMemRefsArray(NumMemRefs); MachineSDNode::mmo_iterator MemRefsPos = MemRefs; for (SmallVectorImpl::const_iterator I = MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) { if ((*I)->isLoad()) { if (mayLoad) *MemRefsPos++ = *I; } else if ((*I)->isStore()) { if (mayStore) *MemRefsPos++ = *I; } else { *MemRefsPos++ = *I; } } Res->setMemRefs(MemRefs, MemRefs + NumMemRefs); } DEBUG( if (!MatchedMemRefs.empty() && Res->memoperands_empty()) dbgs() << " Dropping mem operands\n"; dbgs() << " " << (IsMorphNodeTo ? "Morphed" : "Created") << " node: "; Res->dump(CurDAG); dbgs() << '\n'; ); // If this was a MorphNodeTo then we're completely done! if (IsMorphNodeTo) { // Update chain uses. UpdateChains(Res, InputChain, ChainNodesMatched, true); return; } continue; } case OPC_CompleteMatch: { // The match has been completed, and any new nodes (if any) have been // created. Patch up references to the matched dag to use the newly // created nodes. 
unsigned NumResults = MatcherTable[MatcherIndex++]; for (unsigned i = 0; i != NumResults; ++i) { unsigned ResSlot = MatcherTable[MatcherIndex++]; if (ResSlot & 128) ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex); assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch"); SDValue Res = RecordedNodes[ResSlot].first; assert(i < NodeToMatch->getNumValues() && NodeToMatch->getValueType(i) != MVT::Other && NodeToMatch->getValueType(i) != MVT::Glue && "Invalid number of results to complete!"); assert((NodeToMatch->getValueType(i) == Res.getValueType() || NodeToMatch->getValueType(i) == MVT::iPTR || Res.getValueType() == MVT::iPTR || NodeToMatch->getValueType(i).getSizeInBits() == Res.getValueSizeInBits()) && "invalid replacement"); CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res); } // Update chain uses. UpdateChains(NodeToMatch, InputChain, ChainNodesMatched, false); // If the root node defines glue, we need to update it to the glue result. // TODO: This never happens in our tests and I think it can be removed / // replaced with an assert, but if we do it this the way the change is // NFC. if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) == MVT::Glue && InputGlue.getNode()) CurDAG->ReplaceAllUsesOfValueWith( SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue); assert(NodeToMatch->use_empty() && "Didn't replace all uses of the node?"); CurDAG->RemoveDeadNode(NodeToMatch); return; } } // If the code reached this point, then the match failed. See if there is // another child to try in the current 'Scope', otherwise pop it until we // find a case to check. DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n"); ++NumDAGIselRetries; while (true) { if (MatchScopes.empty()) { CannotYetSelect(NodeToMatch); return; } // Restore the interpreter state back to the point where the scope was // formed. MatchScope &LastScope = MatchScopes.back(); RecordedNodes.resize(LastScope.NumRecordedNodes); NodeStack.clear(); NodeStack.append(LastScope.NodeStack.begin(), LastScope.NodeStack.end()); N = NodeStack.back(); if (LastScope.NumMatchedMemRefs != MatchedMemRefs.size()) MatchedMemRefs.resize(LastScope.NumMatchedMemRefs); MatcherIndex = LastScope.FailIndex; DEBUG(dbgs() << " Continuing at " << MatcherIndex << "\n"); InputChain = LastScope.InputChain; InputGlue = LastScope.InputGlue; if (!LastScope.HasChainNodesMatched) ChainNodesMatched.clear(); // Check to see what the offset is at the new MatcherIndex. If it is zero // we have reached the end of this scope, otherwise we have another child // in the current scope to try. unsigned NumToSkip = MatcherTable[MatcherIndex++]; if (NumToSkip & 128) NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex); // If we have another child in this scope to match, update FailIndex and // try it. if (NumToSkip != 0) { LastScope.FailIndex = MatcherIndex+NumToSkip; break; } // End of this scope, pop it and try the next child in the containing // scope. MatchScopes.pop_back(); } } } bool SelectionDAGISel::isOrEquivalentToAdd(const SDNode *N) const { assert(N->getOpcode() == ISD::OR && "Unexpected opcode"); auto *C = dyn_cast(N->getOperand(1)); if (!C) return false; // Detect when "or" is used to add an offset to a stack object. 
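  // Worked example (for illustration): a frame object with alignment 8 has
  // its low 3 bits known to be zero.  For Off = 4, (A - 1) & Off = 7 & 4 = 4,
  // which equals Off, so no set bits overlap the base and the OR behaves
  // exactly like an ADD.  For Off = 9, 7 & 9 = 1 != 9, so the check below
  // correctly rejects it.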
if (auto *FN = dyn_cast(N->getOperand(0))) { MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned A = MFI.getObjectAlignment(FN->getIndex()); assert(isPowerOf2_32(A) && "Unexpected alignment"); int32_t Off = C->getSExtValue(); // If the alleged offset fits in the zero bits guaranteed by // the alignment, then this or is really an add. return (Off >= 0) && (((A - 1) & Off) == unsigned(Off)); } return false; } void SelectionDAGISel::CannotYetSelect(SDNode *N) { std::string msg; raw_string_ostream Msg(msg); Msg << "Cannot select: "; if (N->getOpcode() != ISD::INTRINSIC_W_CHAIN && N->getOpcode() != ISD::INTRINSIC_WO_CHAIN && N->getOpcode() != ISD::INTRINSIC_VOID) { N->printrFull(Msg, CurDAG); Msg << "\nIn function: " << MF->getName(); } else { bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other; unsigned iid = cast(N->getOperand(HasInputChain))->getZExtValue(); if (iid < Intrinsic::num_intrinsics) Msg << "intrinsic %" << Intrinsic::getName((Intrinsic::ID)iid, None); else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo()) Msg << "target intrinsic %" << TII->getName(iid); else Msg << "unknown intrinsic #" << iid; } report_fatal_error(Msg.str()); } char SelectionDAGISel::ID = 0; Index: head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp (revision 328752) +++ head/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp (revision 328753) @@ -1,1149 +1,1151 @@ //===- TargetPassConfig.cpp - Target independent code generation passes ---===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines interfaces to access the target independent code // generation passes provided by the LLVM backend. 
//
//===---------------------------------------------------------------------===//

#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePassRegistry.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Threading.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/SymbolRewriter.h"
#include <cassert>
#include <string>

using namespace llvm;

cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
                         cl::desc("Enable interprocedural register allocation "
                                  "to reduce load/store at procedure calls."));
static cl::opt<bool> DisablePostRASched("disable-post-ra", cl::Hidden,
    cl::desc("Disable Post Regalloc Scheduler"));
static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
    cl::desc("Disable branch folding"));
static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
    cl::desc("Disable tail duplication"));
static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
    cl::desc("Disable pre-register allocation tail duplication"));
static cl::opt<bool> DisableBlockPlacement("disable-block-placement",
    cl::Hidden, cl::desc("Disable probability-driven block placement"));
static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats",
    cl::Hidden, cl::desc("Collect probability-driven block placement stats"));
static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
    cl::desc("Disable Stack Slot Coloring"));
static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
    cl::desc("Disable Machine Dead Code Elimination"));
static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
    cl::desc("Disable Early If-conversion"));
static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
    cl::desc("Disable Machine LICM"));
static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
    cl::desc("Disable Machine Common Subexpression Elimination"));
static cl::opt<cl::boolOrDefault> OptimizeRegAlloc(
    "optimize-regalloc", cl::Hidden,
    cl::desc("Enable optimized register allocation compilation path."));
static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
    cl::Hidden,
    cl::desc("Disable Machine LICM"));
static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
    cl::desc("Disable Machine Sinking"));
static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
    cl::desc("Disable Loop Strength Reduction Pass"));
static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
    cl::Hidden, cl::desc("Disable ConstantHoisting"));
static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
    cl::desc("Disable Codegen Prepare"));
static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
    cl::desc("Disable Copy Propagation pass"));
static cl::opt<bool>
    DisablePartialLibcallInlining("disable-partial-libcall-inlining",
    cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
static cl::opt<bool> EnableImplicitNullChecks(
    "enable-implicit-null-checks",
    cl::desc("Fold null checks into faulting memory operations"),
    cl::init(false), cl::Hidden);
static cl::opt<bool> EnableMergeICmps(
    "enable-mergeicmps",
    cl::desc("Merge ICmp chains into a single memcmp"),
    cl::init(false), cl::Hidden);
static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
    cl::desc("Print LLVM IR produced by the loop-reduce pass"));
static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
    cl::desc("Print LLVM IR input to isel pass"));
static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
    cl::desc("Dump garbage collector data"));
static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
    cl::desc("Verify generated machine code"),
    cl::init(false),
    cl::ZeroOrMore);
static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
    cl::Hidden,
    cl::desc("Enable machine outliner"));
static cl::opt<bool> EnableLinkOnceODROutlining(
    "enable-linkonceodr-outlining",
    cl::Hidden,
    cl::desc("Enable the machine outliner on linkonceodr functions"),
    cl::init(false));

// Enable or disable FastISel. Both options are needed, because
// FastISel is enabled by default with -fast, and we wish to be
// able to enable or disable fast-isel independently from -O0.
static cl::opt<cl::boolOrDefault>
EnableFastISelOption("fast-isel", cl::Hidden,
  cl::desc("Enable the \"fast\" instruction selector"));

static cl::opt<cl::boolOrDefault>
    EnableGlobalISel("global-isel", cl::Hidden,
                     cl::desc("Enable the \"global\" instruction selector"));

static cl::opt<std::string> PrintMachineInstrs(
    "print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"),
    cl::value_desc("pass-name"), cl::init("option-unspecified"), cl::Hidden);

static cl::opt<int> EnableGlobalISelAbort(
    "global-isel-abort", cl::Hidden,
    cl::desc("Enable abort calls when \"global\" instruction selection "
             "fails to lower/select an instruction: 0 disable the abort, "
             "1 enable the abort, and "
             "2 disable the abort but emit a diagnostic on failure"),
    cl::init(1));

// Temporary option to allow experimenting with MachineScheduler as a post-RA
// scheduler. Targets can "properly" enable this with
// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID).
// Targets can return true in targetSchedulesPostRAScheduling() and
// insert a PostRA scheduling pass wherever it wants.
cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
  cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)"));

// Experimental option to run live interval analysis early.
static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
    cl::desc("Run live interval analysis earlier in the pipeline"));

// Experimental option to use CFL-AA in codegen
enum class CFLAAType { None, Steensgaard, Andersen, Both };
static cl::opt<CFLAAType> UseCFLAA(
    "use-cfl-aa-in-codegen", cl::init(CFLAAType::None), cl::Hidden,
    cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"),
    cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
               clEnumValN(CFLAAType::Steensgaard, "steens",
                          "Enable unification-based CFL-AA"),
               clEnumValN(CFLAAType::Andersen, "anders",
                          "Enable inclusion-based CFL-AA"),
               clEnumValN(CFLAAType::Both, "both",
                          "Enable both variants of CFL-AA")));

/// Option names for limiting the codegen pipeline.
/// Those are used in error reporting and we didn't want
/// to duplicate their names all over the place.
const char *StartAfterOptName = "start-after";
const char *StartBeforeOptName = "start-before";
const char *StopAfterOptName = "stop-after";
const char *StopBeforeOptName = "stop-before";

static cl::opt<std::string>
    StartAfterOpt(StringRef(StartAfterOptName),
                  cl::desc("Resume compilation after a specific pass"),
                  cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

static cl::opt<std::string>
    StartBeforeOpt(StringRef(StartBeforeOptName),
                   cl::desc("Resume compilation before a specific pass"),
                   cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

static cl::opt<std::string>
    StopAfterOpt(StringRef(StopAfterOptName),
                 cl::desc("Stop compilation after a specific pass"),
                 cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

static cl::opt<std::string>
    StopBeforeOpt(StringRef(StopBeforeOptName),
                  cl::desc("Stop compilation before a specific pass"),
                  cl::value_desc("pass-name"), cl::init(""), cl::Hidden);

/// Allow standard passes to be disabled by command line options. This supports
/// simple binary flags that either suppress the pass or do nothing.
/// i.e. -disable-mypass=false has no effect.
/// These should be converted to boolOrDefault in order to use applyOverride.
static IdentifyingPassPtr applyDisable(IdentifyingPassPtr PassID,
                                       bool Override) {
  if (Override)
    return IdentifyingPassPtr();
  return PassID;
}

/// Allow standard passes to be disabled by the command line, regardless of who
/// is adding the pass.
///
/// StandardID is the pass identified in the standard pass pipeline and
/// provided to addPass(). It may be a target-specific ID in the case that the
/// target directly adds its own pass, but in that case we harmlessly fall
/// through.
///
/// TargetID is the pass that the target has configured to override StandardID.
///
/// StandardID may be a pseudo ID. In that case TargetID is the name of the
/// real pass to run. This allows multiple options to control a single pass
/// depending on where in the pipeline that pass is added.
static IdentifyingPassPtr overridePass(AnalysisID StandardID,
                                       IdentifyingPassPtr TargetID) {
  if (StandardID == &PostRASchedulerID)
    return applyDisable(TargetID, DisablePostRASched);

  if (StandardID == &BranchFolderPassID)
    return applyDisable(TargetID, DisableBranchFold);

  if (StandardID == &TailDuplicateID)
    return applyDisable(TargetID, DisableTailDuplicate);

  if (StandardID == &TargetPassConfig::EarlyTailDuplicateID)
    return applyDisable(TargetID, DisableEarlyTailDup);

  if (StandardID == &MachineBlockPlacementID)
    return applyDisable(TargetID, DisableBlockPlacement);

  if (StandardID == &StackSlotColoringID)
    return applyDisable(TargetID, DisableSSC);

  if (StandardID == &DeadMachineInstructionElimID)
    return applyDisable(TargetID, DisableMachineDCE);

  if (StandardID == &EarlyIfConverterID)
    return applyDisable(TargetID, DisableEarlyIfConversion);

  if (StandardID == &MachineLICMID)
    return applyDisable(TargetID, DisableMachineLICM);

  if (StandardID == &MachineCSEID)
    return applyDisable(TargetID, DisableMachineCSE);

  if (StandardID == &TargetPassConfig::PostRAMachineLICMID)
    return applyDisable(TargetID, DisablePostRAMachineLICM);

  if (StandardID == &MachineSinkingID)
    return applyDisable(TargetID, DisableMachineSink);

  if (StandardID == &MachineCopyPropagationID)
    return applyDisable(TargetID, DisableCopyProp);

  return TargetID;
}

//===---------------------------------------------------------------------===//
/// TargetPassConfig
//===---------------------------------------------------------------------===//

INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
                "Target Pass Configuration", false, false)

char TargetPassConfig::ID = 0;

// Pseudo Pass IDs.
char TargetPassConfig::EarlyTailDuplicateID = 0;
char TargetPassConfig::PostRAMachineLICMID = 0;

namespace {

struct InsertedPass {
  AnalysisID TargetPassID;
  IdentifyingPassPtr InsertedPassID;
  bool VerifyAfter;
  bool PrintAfter;

  InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
               bool VerifyAfter, bool PrintAfter)
      : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID),
        VerifyAfter(VerifyAfter), PrintAfter(PrintAfter) {}

  Pass *getInsertedPass() const {
    assert(InsertedPassID.isValid() && "Illegal Pass ID!");
    if (InsertedPassID.isInstance())
      return InsertedPassID.getInstance();
    Pass *NP = Pass::createPass(InsertedPassID.getID());
    assert(NP && "Pass ID not registered");
    return NP;
  }
};

} // end anonymous namespace

namespace llvm {

class PassConfigImpl {
public:
  // List of passes explicitly substituted by this target. Normally this is
  // empty, but it is a convenient way to suppress or replace specific passes
  // that are part of a standard pass pipeline without overriding the entire
  // pipeline. This mechanism allows target options to inherit a standard
  // pass's user interface. For example, a target may disable a standard pass
  // by default by substituting a pass ID of zero, and the user may still
  // enable that standard pass with an explicit command line option.
  DenseMap<AnalysisID, IdentifyingPassPtr> TargetPasses;

  /// Store the pairs of <AnalysisID, IdentifyingPassPtr> of which the second
  /// pass is inserted after each instance of the first one.
  SmallVector<InsertedPass, 4> InsertedPasses;
};

} // end namespace llvm

// Out of line virtual method.
TargetPassConfig::~TargetPassConfig() { delete Impl; } static const PassInfo *getPassInfo(StringRef PassName) { if (PassName.empty()) return nullptr; const PassRegistry &PR = *PassRegistry::getPassRegistry(); const PassInfo *PI = PR.getPassInfo(PassName); if (!PI) report_fatal_error(Twine('\"') + Twine(PassName) + Twine("\" pass is not registered.")); return PI; } static AnalysisID getPassIDFromName(StringRef PassName) { const PassInfo *PI = getPassInfo(PassName); return PI ? PI->getTypeInfo() : nullptr; } void TargetPassConfig::setStartStopPasses() { StartBefore = getPassIDFromName(StartBeforeOpt); StartAfter = getPassIDFromName(StartAfterOpt); StopBefore = getPassIDFromName(StopBeforeOpt); StopAfter = getPassIDFromName(StopAfterOpt); if (StartBefore && StartAfter) report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") + Twine(StartAfterOptName) + Twine(" specified!")); if (StopBefore && StopAfter) report_fatal_error(Twine(StopBeforeOptName) + Twine(" and ") + Twine(StopAfterOptName) + Twine(" specified!")); Started = (StartAfter == nullptr) && (StartBefore == nullptr); } // Out of line constructor provides default values for pass options and // registers all common codegen passes. TargetPassConfig::TargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &pm) : ImmutablePass(ID), PM(&pm), TM(&TM) { Impl = new PassConfigImpl(); // Register all target independent codegen passes to activate their PassIDs, // including this pass itself. initializeCodeGen(*PassRegistry::getPassRegistry()); // Also register alias analysis passes required by codegen passes. initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry()); initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry()); // Substitute Pseudo Pass IDs for real ones. substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); substitutePass(&PostRAMachineLICMID, &MachineLICMID); if (StringRef(PrintMachineInstrs.getValue()).equals("")) TM.Options.PrintMachineCode = true; if (EnableIPRA.getNumOccurrences()) TM.Options.EnableIPRA = EnableIPRA; else { // If not explicitly specified, use target default. TM.Options.EnableIPRA = TM.useIPRA(); } if (TM.Options.EnableIPRA) setRequiresCodeGenSCCOrder(); setStartStopPasses(); } CodeGenOpt::Level TargetPassConfig::getOptLevel() const { return TM->getOptLevel(); } /// Insert InsertedPassID pass after TargetPassID. void TargetPassConfig::insertPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID, bool VerifyAfter, bool PrintAfter) { assert(((!InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getID()) || (InsertedPassID.isInstance() && TargetPassID != InsertedPassID.getInstance()->getPassID())) && "Insert a pass after itself!"); Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter, PrintAfter); } /// createPassConfig - Create a pass configuration object to be used by /// addPassToEmitX methods for generating a pipeline of CodeGen passes. /// /// Targets may override this to extend TargetPassConfig. TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) { return new TargetPassConfig(*this, PM); } TargetPassConfig::TargetPassConfig() : ImmutablePass(ID) { report_fatal_error("Trying to construct TargetPassConfig without a target " "machine. 
Scheduling a CodeGen pass without a target " "triple set?");
}

bool TargetPassConfig::hasLimitedCodeGenPipeline() const {
  return StartBefore || StartAfter || StopBefore || StopAfter;
}

std::string
TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) const {
  if (!hasLimitedCodeGenPipeline())
    return std::string();
  std::string Res;
  static cl::opt<std::string> *PassNames[] = {&StartAfterOpt, &StartBeforeOpt,
                                              &StopAfterOpt, &StopBeforeOpt};
  static const char *OptNames[] = {StartAfterOptName, StartBeforeOptName,
                                   StopAfterOptName, StopBeforeOptName};
  bool IsFirst = true;
  for (int Idx = 0; Idx < 4; ++Idx)
    if (!PassNames[Idx]->empty()) {
      if (!IsFirst)
        Res += Separator;
      IsFirst = false;
      Res += OptNames[Idx];
    }
  return Res;
}

// Helper to verify the analysis is really immutable.
void TargetPassConfig::setOpt(bool &Opt, bool Val) {
  assert(!Initialized && "PassConfig is immutable");
  Opt = Val;
}

void TargetPassConfig::substitutePass(AnalysisID StandardID,
                                      IdentifyingPassPtr TargetID) {
  Impl->TargetPasses[StandardID] = TargetID;
}

IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
  DenseMap<AnalysisID, IdentifyingPassPtr>::const_iterator
    I = Impl->TargetPasses.find(ID);
  if (I == Impl->TargetPasses.end())
    return ID;
  return I->second;
}

bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
  IdentifyingPassPtr TargetID = getPassSubstitution(ID);
  IdentifyingPassPtr FinalPtr = overridePass(ID, TargetID);
  return !FinalPtr.isValid() || FinalPtr.isInstance() ||
      FinalPtr.getID() != ID;
}

/// Add a pass to the PassManager if that pass is supposed to be run. If the
/// Started/Stopped flags indicate either that the compilation should start at
/// a later pass or that it should stop after an earlier pass, then do not add
/// the pass. Finally, compare the current pass against the StartAfter
/// and StopAfter options and change the Started/Stopped flags accordingly.
void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
  assert(!Initialized && "PassConfig is immutable");

  // Cache the Pass ID here in case the pass manager finds this pass is
  // redundant with ones already scheduled / available, and deletes it.
  // Fundamentally, once we add the pass to the manager, we no longer own it
  // and shouldn't reference it.
  AnalysisID PassID = P->getPassID();

  if (StartBefore == PassID)
    Started = true;
  if (StopBefore == PassID)
    Stopped = true;
  if (Started && !Stopped) {
    std::string Banner;
    // Construct banner message before PM->add() as that may delete the pass.
    if (AddingMachinePasses && (printAfter || verifyAfter))
      Banner = std::string("After ") + std::string(P->getPassName());
    PM->add(P);
    if (AddingMachinePasses) {
      if (printAfter)
        addPrintPass(Banner);
      if (verifyAfter)
        addVerifyPass(Banner);
    }

    // Add the passes after the pass P if there are any.
    for (auto IP : Impl->InsertedPasses) {
      if (IP.TargetPassID == PassID)
        addPass(IP.getInsertedPass(), IP.VerifyAfter, IP.PrintAfter);
    }
  } else {
    delete P;
  }
  if (StopAfter == PassID)
    Stopped = true;
  if (StartAfter == PassID)
    Started = true;
  if (Stopped && !Started)
    report_fatal_error("Cannot stop compilation after pass that is not run");
}
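// Illustrative use of the Started/Stopped machinery above (the option names
// come from the cl::opts referenced by setStartStopPasses(); the pass names
// here are just examples):
// \code
//   llc -stop-after=machine-scheduler foo.ll   # run up to pre-RA scheduling
//   llc -start-before=greedy foo.mir           # resume at register allocation
// \endcode
// Passes outside the selected window are still constructed, but addPass()
// deletes them instead of handing them to the PassManager.

/// Add a CodeGen pass at this point in the pipeline after checking for target
/// and command line overrides.
///
/// addPass cannot return a pointer to the pass instance because it is internal
/// to the PassManager and the instance we create here may already be freed.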
AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter, bool printAfter) { IdentifyingPassPtr TargetID = getPassSubstitution(PassID); IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID); if (!FinalPtr.isValid()) return nullptr; Pass *P; if (FinalPtr.isInstance()) P = FinalPtr.getInstance(); else { P = Pass::createPass(FinalPtr.getID()); if (!P) llvm_unreachable("Pass ID not registered"); } AnalysisID FinalID = P->getPassID(); addPass(P, verifyAfter, printAfter); // Ends the lifetime of P. return FinalID; } void TargetPassConfig::printAndVerify(const std::string &Banner) { addPrintPass(Banner); addVerifyPass(Banner); } void TargetPassConfig::addPrintPass(const std::string &Banner) { if (TM->shouldPrintMachineCode()) PM->add(createMachineFunctionPrinterPass(dbgs(), Banner)); } void TargetPassConfig::addVerifyPass(const std::string &Banner) { bool Verify = VerifyMachineCode; #ifdef EXPENSIVE_CHECKS if (VerifyMachineCode == cl::BOU_UNSET) Verify = TM->isMachineVerifierClean(); #endif if (Verify) PM->add(createMachineVerifierPass(Banner)); } /// Add common target configurable passes that perform LLVM IR to IR transforms /// following machine independent optimization. void TargetPassConfig::addIRPasses() { switch (UseCFLAA) { case CFLAAType::Steensgaard: addPass(createCFLSteensAAWrapperPass()); break; case CFLAAType::Andersen: addPass(createCFLAndersAAWrapperPass()); break; case CFLAAType::Both: addPass(createCFLAndersAAWrapperPass()); addPass(createCFLSteensAAWrapperPass()); break; default: break; } // Basic AliasAnalysis support. // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that // BasicAliasAnalysis wins if they disagree. This is intended to help // support "obvious" type-punning idioms. addPass(createTypeBasedAAWrapperPass()); addPass(createScopedNoAliasAAWrapperPass()); addPass(createBasicAAWrapperPass()); // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. if (!DisableVerify) addPass(createVerifierPass()); // Run loop strength reduction before anything else. if (getOptLevel() != CodeGenOpt::None && !DisableLSR) { addPass(createLoopStrengthReducePass()); if (PrintLSR) addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n")); } if (getOptLevel() != CodeGenOpt::None) { // The MergeICmpsPass tries to create memcmp calls by grouping sequences of // loads and compares. ExpandMemCmpPass then tries to expand those calls // into optimally-sized loads and compares. The transforms are enabled by a // target lowering hook. if (EnableMergeICmps) addPass(createMergeICmpsPass()); addPass(createExpandMemCmpPass()); } // Run GC lowering passes for builtin collectors // TODO: add a pass insertion point here addPass(createGCLoweringPass()); addPass(createShadowStackGCLoweringPass()); // Make sure that no unreachable blocks are instruction selected. addPass(createUnreachableBlockEliminationPass()); // Prepare expensive constants for SelectionDAG. if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting) addPass(createConstantHoistingPass()); if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining) addPass(createPartiallyInlineLibCallsPass()); // Instrument function entry and exit, e.g. with calls to mcount(). addPass(createPostInlineEntryExitInstrumenterPass()); // Add scalarization of target's unsupported masked memory intrinsics pass. 
// The unsupported intrinsic is replaced with a chain of basic blocks that
  // store/load the elements one-by-one if the appropriate mask bit is set.
  addPass(createScalarizeMaskedMemIntrinPass());

  // Expand reduction intrinsics into shuffle sequences if the target wants to.
  addPass(createExpandReductionsPass());
}

/// Turn exception handling constructs into something the code generators can
/// handle.
void TargetPassConfig::addPassesToHandleExceptions() {
  const MCAsmInfo *MCAI = TM->getMCAsmInfo();
  assert(MCAI && "No MCAsmInfo");
  switch (MCAI->getExceptionHandlingType()) {
  case ExceptionHandling::SjLj:
    // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both.
    // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
    // catch info can get misplaced when a selector ends up more than one block
    // removed from the parent invoke(s). This could happen when a landing
    // pad is shared by multiple invokes and is also a target of a normal
    // edge from elsewhere.
    addPass(createSjLjEHPreparePass());
    LLVM_FALLTHROUGH;
  case ExceptionHandling::DwarfCFI:
  case ExceptionHandling::ARM:
    addPass(createDwarfEHPass());
    break;
  case ExceptionHandling::WinEH:
    // We support using both GCC-style and MSVC-style exceptions on Windows, so
    // add both preparation passes. Each pass will only actually run if it
    // recognizes the personality function.
    addPass(createWinEHPass());
    addPass(createDwarfEHPass());
    break;
  case ExceptionHandling::None:
    addPass(createLowerInvokePass());

    // The lower invoke pass may create unreachable code. Remove it.
    addPass(createUnreachableBlockEliminationPass());
    break;
  }
}

/// Add pass to prepare the LLVM IR for code generation. This should be done
/// before exception handling preparation passes.
void TargetPassConfig::addCodeGenPrepare() {
  if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
    addPass(createCodeGenPreparePass());
  addPass(createRewriteSymbolsPass());
}

/// Add common passes that perform LLVM IR to IR transforms in preparation for
/// instruction selection.
void TargetPassConfig::addISelPrepare() {
  addPreISel();

  // Force codegen to run according to the callgraph.
  if (requiresCodeGenSCCOrder())
    addPass(new DummyCGSCCPass);

  // Add both the safe stack and the stack protection passes: each of them will
  // only protect functions that have corresponding attributes.
  addPass(createSafeStackPass());
  addPass(createStackProtectorPass());

  if (PrintISelInput)
    addPass(createPrintFunctionPass(
        dbgs(), "\n\n*** Final LLVM Code input to ISel ***\n"));

  // All passes which modify the LLVM IR are now complete; run the verifier
  // to ensure that the IR is valid.
  if (!DisableVerify)
    addPass(createVerifierPass());
}

bool TargetPassConfig::addCoreISelPasses() {
  // Enable FastISel with -fast, but allow that to be overridden.
  TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
  if (EnableFastISelOption == cl::BOU_TRUE ||
      (TM->getOptLevel() == CodeGenOpt::None && TM->getO0WantsFastISel()))
    TM->setFastISel(true);

  // Ask the target for an isel.
  // Enable GlobalISel if the target wants to, but allow that to be overridden.
  // Explicitly enabling fast-isel should override implicitly enabled
  // global-isel.
  if (EnableGlobalISel == cl::BOU_TRUE ||
      (EnableGlobalISel == cl::BOU_UNSET && isGlobalISelEnabled() &&
       EnableFastISelOption != cl::BOU_TRUE)) {
+    TM->setFastISel(false);
+
    if (addIRTranslator())
      return true;

    addPreLegalizeMachineIR();

    if (addLegalizeMachineIR())
      return true;

    // Before running the register bank selector, ask the target if it
    // wants to run some passes.
addPreRegBankSelect();

    if (addRegBankSelect())
      return true;

    addPreGlobalInstructionSelect();

    if (addGlobalInstructionSelect())
      return true;

    // Pass to reset the MachineFunction if the ISel failed.
    addPass(createResetMachineFunctionPass(
        reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled()));

    // Provide a fallback path when we do not want to abort on
    // not-yet-supported input.
    if (!isGlobalISelAbortEnabled() && addInstSelector())
      return true;

  } else if (addInstSelector())
    return true;

  return false;
}

bool TargetPassConfig::addISelPasses() {
  if (TM->Options.EmulatedTLS)
    addPass(createLowerEmuTLSPass());

  addPass(createPreISelIntrinsicLoweringPass());
  addPass(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
  addIRPasses();
  addCodeGenPrepare();
  addPassesToHandleExceptions();
  addISelPrepare();

  return addCoreISelPasses();
}

/// -regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<RegisterRegAlloc>>
    RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use"));

/// Add the complete set of target-independent postISel code generator passes.
///
/// This can be read as the standard order of major LLVM CodeGen stages. Stages
/// with nontrivial configuration or multiple passes are broken out below in
/// add%Stage routines.
///
/// Any TargetPassConfig::addXX routine may be overridden by the Target. The
/// addPre/Post methods with empty header implementations allow injecting
/// target-specific fixups just before or after major stages. Additionally,
/// targets have the flexibility to change pass order within a stage by
/// overriding the default implementation of add%Stage routines below. Each
/// technique has maintainability tradeoffs because alternate pass orders are
/// not well supported. addPre/Post works better if the target pass is easily
/// tied to a common pass. But if it has subtle dependencies on multiple passes,
/// the target should override the stage instead.
///
/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
/// before/after any target-independent pass. But it's currently overkill.
void TargetPassConfig::addMachinePasses() {
  AddingMachinePasses = true;

  // Insert a machine instr printer pass after the specified pass.
  if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
      !StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
    const PassRegistry *PR = PassRegistry::getPassRegistry();
    const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
    const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
    assert (TPI && IPI && "Pass ID not registered!");
    const char *TID = (const char *)(TPI->getTypeInfo());
    const char *IID = (const char *)(IPI->getTypeInfo());
    insertPass(TID, IID);
  }

  // Print the instruction selected machine code...
  printAndVerify("After Instruction Selection");

  // Expand pseudo-instructions emitted by ISel.
  addPass(&ExpandISelPseudosID);

  // Add passes that optimize machine instructions in SSA form.
  if (getOptLevel() != CodeGenOpt::None) {
    addMachineSSAOptimization();
  } else {
    // If the target requests it, assign local variables to stack slots
    // relative to one another and simplify frame index references where
    // possible.
    addPass(&LocalStackSlotAllocationID, false);
  }

  if (TM->Options.EnableIPRA)
    addPass(createRegUsageInfoPropPass());
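  // Sketch (hypothetical backend, illustrative only): targets usually inject
  // their own machine passes through the addPre*/addPost* hooks that
  // addMachinePasses() invokes at each stage boundary, e.g.
  // \code
  //   void MyTargetPassConfig::addPreRegAlloc() {
  //     addPass(&MyTargetLoadStoreOptID); // hypothetical target pass
  //   }
  // \endcode

  // Run pre-ra passes.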
addPreRegAlloc(); // Run register allocation and passes that are tightly coupled with it, // including phi elimination and scheduling. if (getOptimizeRegAlloc()) addOptimizedRegAlloc(createRegAllocPass(true)); else { if (RegAlloc != &useDefaultRegisterAllocator && RegAlloc != &createFastRegisterAllocator) report_fatal_error("Must use fast (default) register allocator for unoptimized regalloc."); addFastRegAlloc(createRegAllocPass(false)); } // Run post-ra passes. addPostRegAlloc(); // Insert prolog/epilog code. Eliminate abstract frame index references... if (getOptLevel() != CodeGenOpt::None) addPass(&ShrinkWrapID); // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only // do so if it hasn't been disabled, substituted, or overridden. if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID)) addPass(createPrologEpilogInserterPass()); /// Add passes that optimize machine instructions after register allocation. if (getOptLevel() != CodeGenOpt::None) addMachineLateOptimization(); // Expand pseudo instructions before second scheduling pass. addPass(&ExpandPostRAPseudosID); // Run pre-sched2 passes. addPreSched2(); if (EnableImplicitNullChecks) addPass(&ImplicitNullChecksID); // Second pass scheduler. // Let Target optionally insert this pass by itself at some other // point. if (getOptLevel() != CodeGenOpt::None && !TM->targetSchedulesPostRAScheduling()) { if (MISchedPostRA) addPass(&PostMachineSchedulerID); else addPass(&PostRASchedulerID); } // GC if (addGCPasses()) { if (PrintGCInfo) addPass(createGCInfoPrinter(dbgs()), false, false); } // Basic block placement. if (getOptLevel() != CodeGenOpt::None) addBlockPlacement(); addPreEmitPass(); if (TM->Options.EnableIPRA) // Collect register usage information and produce a register mask of // clobbered registers, to be used to optimize call sites. addPass(createRegUsageInfoCollector()); addPass(&FuncletLayoutID, false); addPass(&StackMapLivenessID, false); addPass(&LiveDebugValuesID, false); // Insert before XRay Instrumentation. addPass(&FEntryInserterID, false); addPass(&XRayInstrumentationID, false); addPass(&PatchableFunctionID, false); if (EnableMachineOutliner) PM->add(createMachineOutlinerPass(EnableLinkOnceODROutlining)); AddingMachinePasses = false; } /// Add passes that optimize machine instructions in SSA form. void TargetPassConfig::addMachineSSAOptimization() { // Pre-ra tail duplication. addPass(&EarlyTailDuplicateID); // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. addPass(&OptimizePHIsID, false); // This pass merges large allocas. StackSlotColoring is a different pass // which merges spill slots. addPass(&StackColoringID, false); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. addPass(&LocalStackSlotAllocationID, false); // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll). addPass(&DeadMachineInstructionElimID); // Allow targets to insert passes that improve instruction level parallelism, // like if-conversion. Such passes will typically need dominator trees and // loop info, just like LICM and CSE below. 
addILPOpts();

  addPass(&MachineLICMID, false);
  addPass(&MachineCSEID, false);
  addPass(&MachineSinkingID);

  addPass(&PeepholeOptimizerID);
  // Clean-up the dead code that may have been generated by peephole
  // rewriting.
  addPass(&DeadMachineInstructionElimID);
}

//===---------------------------------------------------------------------===//
/// Register Allocation Pass Configuration
//===---------------------------------------------------------------------===//

bool TargetPassConfig::getOptimizeRegAlloc() const {
  switch (OptimizeRegAlloc) {
  case cl::BOU_UNSET: return getOptLevel() != CodeGenOpt::None;
  case cl::BOU_TRUE: return true;
  case cl::BOU_FALSE: return false;
  }
  llvm_unreachable("Invalid optimize-regalloc state");
}

/// RegisterRegAlloc's global Registry tracks allocator registration.
MachinePassRegistry RegisterRegAlloc::Registry;

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;

static RegisterRegAlloc
defaultRegAlloc("default",
                "pick register allocator based on -O option",
                useDefaultRegisterAllocator);

static void initializeDefaultRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = RegAlloc;
    RegisterRegAlloc::setDefault(RegAlloc);
  }
}

/// Instantiate the default register allocator pass for this target for either
/// the optimized or unoptimized allocation path. This will be added to the pass
/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
/// in the optimized case.
///
/// A target that uses the standard regalloc pass order for fast or optimized
/// allocation may still override this for per-target regalloc
/// selection. But -regalloc=... always takes precedence.
FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
  if (Optimized)
    return createGreedyRegisterAllocator();
  else
    return createFastRegisterAllocator();
}

/// Find and instantiate the register allocation pass requested by this target
/// at the current optimization level. Different register allocators are
/// defined as separate passes because they may require different analysis.
///
/// This helper ensures that the regalloc= option is always available,
/// even for targets that override the default allocator.
///
/// FIXME: When MachinePassRegistry registers pass IDs instead of function ptrs,
/// this can be folded into addPass.
FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultRegisterAllocatorFlag,
                  initializeDefaultRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  // With no -regalloc= override, ask the target for a regalloc pass.
  return createTargetRegisterAllocator(Optimized);
}

/// Return true if the default global register allocator is in use and
/// has not been overridden on the command line with '-regalloc=...'
bool TargetPassConfig::usingDefaultRegAlloc() const {
  return RegAlloc.getNumOccurrences() == 0;
}
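// Registration pattern for a new allocator (sketch; "myalloc" and the factory
// are hypothetical). It mirrors defaultRegAlloc above and is what makes an
// allocator selectable via -regalloc=:
// \code
//   static FunctionPass *createMyRegAlloc(); // hypothetical factory
//   static RegisterRegAlloc myRegAlloc("myalloc", "my experimental allocator",
//                                      createMyRegAlloc);
// \endcode
// Once registered, -regalloc=myalloc takes precedence over the target's
// createTargetRegisterAllocator() in createRegAllocPass() above.

/// Add the minimum set of target-independent passes that are required for
/// register allocation. No coalescing or scheduling.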
void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { addPass(&PHIEliminationID, false); addPass(&TwoAddressInstructionPassID, false); if (RegAllocPass) addPass(RegAllocPass); } /// Add standard target-independent passes that are tightly coupled with /// optimized register allocation, including coalescing, machine instruction /// scheduling, and register allocation itself. void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { addPass(&DetectDeadLanesID, false); addPass(&ProcessImplicitDefsID, false); // LiveVariables currently requires pure SSA form. // // FIXME: Once TwoAddressInstruction pass no longer uses kill flags, // LiveVariables can be removed completely, and LiveIntervals can be directly // computed. (We still either need to regenerate kill flags after regalloc, or // preferably fix the scavenger to not depend on them). addPass(&LiveVariablesID, false); // Edge splitting is smarter with machine loop info. addPass(&MachineLoopInfoID, false); addPass(&PHIEliminationID, false); // Eventually, we want to run LiveIntervals before PHI elimination. if (EarlyLiveIntervals) addPass(&LiveIntervalsID, false); addPass(&TwoAddressInstructionPassID, false); addPass(&RegisterCoalescerID); // The machine scheduler may accidentally create disconnected components // when moving subregister definitions around, avoid this by splitting them to // separate vregs before. Splitting can also improve reg. allocation quality. addPass(&RenameIndependentSubregsID); // PreRA instruction scheduling. addPass(&MachineSchedulerID); if (RegAllocPass) { // Add the selected register allocation pass. addPass(RegAllocPass); // Allow targets to change the register assignments before rewriting. addPreRewrite(); // Finally rewrite virtual registers. addPass(&VirtRegRewriterID); // Perform stack slot coloring and post-ra machine LICM. // // FIXME: Re-enable coloring with register when it's capable of adding // kill markers. addPass(&StackSlotColoringID); // Run post-ra machine LICM to hoist reloads / remats. // // FIXME: can this move into MachineLateOptimization? addPass(&PostRAMachineLICMID); } } //===---------------------------------------------------------------------===// /// Post RegAlloc Pass Configuration //===---------------------------------------------------------------------===// /// Add passes that optimize machine instructions after register allocation. void TargetPassConfig::addMachineLateOptimization() { // Branch folding must be run after regalloc and prolog/epilog insertion. addPass(&BranchFolderPassID); // Tail duplication. // Note that duplicating tail just increases code size and degrades // performance for targets that require Structured Control Flow. // In addition it can also make CFG irreducible. Thus we disable it. if (!TM->requiresStructuredCFG()) addPass(&TailDuplicateID); // Copy propagation. addPass(&MachineCopyPropagationID); } /// Add standard GC passes. bool TargetPassConfig::addGCPasses() { addPass(&GCMachineCodeAnalysisID, false); return true; } /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { if (addPass(&MachineBlockPlacementID)) { // Run a separate pass to collect block placement statistics. 
if (EnableBlockPlacementStats)
      addPass(&MachineBlockPlacementStatsID);
  }
}

//===---------------------------------------------------------------------===//
/// GlobalISel Configuration
//===---------------------------------------------------------------------===//

bool TargetPassConfig::isGlobalISelEnabled() const {
  return false;
}

bool TargetPassConfig::isGlobalISelAbortEnabled() const {
  if (EnableGlobalISelAbort.getNumOccurrences() > 0)
    return EnableGlobalISelAbort == 1;

  // When no abort behaviour is specified, we don't abort if the target says
  // that GISel is enabled.
  return !isGlobalISelEnabled();
}

bool TargetPassConfig::reportDiagnosticWhenGlobalISelFallback() const {
  return EnableGlobalISelAbort == 2;
}
Index: head/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp	(revision 328752)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp	(revision 328753)
@@ -1,5153 +1,5154 @@
//===- AArch64FastISel.cpp - AArch64 FastISel implementation -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the AArch64-specific support for the FastISel class. Some
// of the target-specific code is generated by tablegen in the file
// AArch64GenFastISel.inc, which is #included here.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64CallingConvention.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
"llvm/Support/MathExtras.h" #include #include #include #include #include using namespace llvm; namespace { class AArch64FastISel final : public FastISel { class Address { public: using BaseKind = enum { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend; union { unsigned Reg; int FI; } Base; unsigned OffsetReg = 0; unsigned Shift = 0; int64_t Offset = 0; const GlobalValue *GV = nullptr; public: Address() { Base.Reg = 0; } void setKind(BaseKind K) { Kind = K; } BaseKind getKind() const { return Kind; } void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; } AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; } bool isRegBase() const { return Kind == RegBase; } bool isFIBase() const { return Kind == FrameIndexBase; } void setReg(unsigned Reg) { assert(isRegBase() && "Invalid base register access!"); Base.Reg = Reg; } unsigned getReg() const { assert(isRegBase() && "Invalid base register access!"); return Base.Reg; } void setOffsetReg(unsigned Reg) { OffsetReg = Reg; } unsigned getOffsetReg() const { return OffsetReg; } void setFI(unsigned FI) { assert(isFIBase() && "Invalid base frame index access!"); Base.FI = FI; } unsigned getFI() const { assert(isFIBase() && "Invalid base frame index access!"); return Base.FI; } void setOffset(int64_t O) { Offset = O; } int64_t getOffset() { return Offset; } void setShift(unsigned S) { Shift = S; } unsigned getShift() { return Shift; } void setGlobalValue(const GlobalValue *G) { GV = G; } const GlobalValue *getGlobalValue() { return GV; } }; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; LLVMContext *Context; bool fastLowerArguments() override; bool fastLowerCall(CallLoweringInfo &CLI) override; bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; private: // Selection routines. bool selectAddSub(const Instruction *I); bool selectLogicalOp(const Instruction *I); bool selectLoad(const Instruction *I); bool selectStore(const Instruction *I); bool selectBranch(const Instruction *I); bool selectIndirectBr(const Instruction *I); bool selectCmp(const Instruction *I); bool selectSelect(const Instruction *I); bool selectFPExt(const Instruction *I); bool selectFPTrunc(const Instruction *I); bool selectFPToInt(const Instruction *I, bool Signed); bool selectIntToFP(const Instruction *I, bool Signed); bool selectRem(const Instruction *I, unsigned ISDOpcode); bool selectRet(const Instruction *I); bool selectTrunc(const Instruction *I); bool selectIntExt(const Instruction *I); bool selectMul(const Instruction *I); bool selectShift(const Instruction *I); bool selectBitCast(const Instruction *I); bool selectFRem(const Instruction *I); bool selectSDiv(const Instruction *I); bool selectGetElementPtr(const Instruction *I); bool selectAtomicCmpXchg(const AtomicCmpXchgInst *I); // Utility helper routines. 
bool isTypeLegal(Type *Ty, MVT &VT); bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false); bool isValueAvailable(const Value *V) const; bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr); bool computeCallAddress(const Value *V, Address &Addr); bool simplifyAddress(Address &Addr, MVT VT); void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, MachineMemOperand::Flags Flags, unsigned ScaleFactor, MachineMemOperand *MMO); bool isMemCpySmall(uint64_t Len, unsigned Alignment); bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I, const Value *Cond); bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT); bool optimizeSelect(const SelectInst *SI); std::pair getRegForGEPIndex(const Value *Idx); // Emit helper routines. unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, bool SetFlags = false, bool WantResult = true); unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm, bool SetFlags = false, bool WantResult = true); unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool SetFlags = false, bool WantResult = true); unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ExtType, uint64_t ShiftImm, bool SetFlags = false, bool WantResult = true); // Emit functions. 
bool emitCompareAndBranch(const BranchInst *BI); bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt); bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS); unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true, MachineMemOperand *MMO = nullptr); bool emitStore(MVT VT, unsigned SrcReg, Address Addr, MachineMemOperand *MMO = nullptr); bool emitStoreRelease(MVT VT, unsigned SrcReg, unsigned AddrReg, MachineMemOperand *MMO = nullptr); unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm); unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags = false, bool WantResult = true, bool IsZExt = false); unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, bool WantResult = true); unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool WantResult = true); unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, const Value *RHS); unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, uint64_t ShiftImm); unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm); unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill, unsigned Op1, bool Op1IsKill); unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, unsigned Op1Reg, bool Op1IsKill); unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, uint64_t Imm, bool IsZExt = true); unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, unsigned Op1Reg, bool Op1IsKill); unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, uint64_t Imm, bool IsZExt = true); unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill, unsigned Op1Reg, bool Op1IsKill); unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill, uint64_t Imm, bool IsZExt = false); unsigned materializeInt(const ConstantInt *CI, MVT VT); unsigned materializeFP(const ConstantFP *CFP, MVT VT); unsigned materializeGV(const GlobalValue *GV); // Call handling routines. private: CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl &ArgVTs, unsigned &NumBytes); bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes); public: // Backend specific FastISel code. 
unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
  unsigned fastMaterializeConstant(const Constant *C) override;
  unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;

  explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
                           const TargetLibraryInfo *LibInfo)
      : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
    Subtarget =
        &static_cast<const AArch64Subtarget &>(FuncInfo.MF->getSubtarget());
    Context = &FuncInfo.Fn->getContext();
  }

  bool fastSelectInstruction(const Instruction *I) override;

#include "AArch64GenFastISel.inc"
};

} // end anonymous namespace

#include "AArch64GenCallingConv.inc"

/// \brief Check if the sign-/zero-extend will be a noop.
static bool isIntExtFree(const Instruction *I) {
  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
         "Unexpected integer extend instruction.");
  assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
         "Unexpected value type.");
  bool IsZExt = isa<ZExtInst>(I);

  if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
    if (LI->hasOneUse())
      return true;

  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0)))
    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr()))
      return true;

  return false;
}

/// \brief Determine the implicit scale factor that is applied by a memory
/// operation for a given value type.
static unsigned getImplicitScaleFactor(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return 0;    // invalid
  case MVT::i1:  // fall-through
  case MVT::i8:
    return 1;
  case MVT::i16:
    return 2;
  case MVT::i32: // fall-through
  case MVT::f32:
    return 4;
  case MVT::i64: // fall-through
  case MVT::f64:
    return 8;
  }
}

CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
  if (CC == CallingConv::WebKit_JS)
    return CC_AArch64_WebKit_JS;
  if (CC == CallingConv::GHC)
    return CC_AArch64_GHC;
  return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
}

unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
  assert(TLI.getValueType(DL, AI->getType(), true) == MVT::i64 &&
         "Alloca should always return a pointer.");

  // Don't handle dynamic allocas.
  if (!FuncInfo.StaticAllocaMap.count(AI))
    return 0;

  DenseMap<const AllocaInst *, int>::iterator SI =
      FuncInfo.StaticAllocaMap.find(AI);

  if (SI != FuncInfo.StaticAllocaMap.end()) {
    unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
            ResultReg)
        .addFrameIndex(SI->second)
        .addImm(0)
        .addImm(0);
    return ResultReg;
  }

  return 0;
}

unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
  if (VT > MVT::i64)
    return 0;

  if (!CI->isZero())
    return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());

  // Create a copy from the zero register to materialize a "0" value.
  const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
                                                   : &AArch64::GPR32RegClass;
  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
  unsigned ResultReg = createResultReg(RC);
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
          ResultReg).addReg(ZeroReg, getKillRegState(true));
  return ResultReg;
}
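// For instance (illustrative, pseudo-MIR): materializeInt() above turns an
// i32 zero into a plain copy of the zero register rather than a move
// immediate:
// \code
//   %0:gpr32 = COPY $wzr    ; i32 0
// \endcode
// which later passes can fold or coalesce away.

unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
  // Positive zero (+0.0) has to be materialized with a fmov from the zero
  // register, because the immediate version of fmov cannot encode zero.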
if (CFP->isNullValue()) return fastMaterializeFloatZero(CFP); if (VT != MVT::f32 && VT != MVT::f64) return 0; const APFloat Val = CFP->getValueAPF(); bool Is64Bit = (VT == MVT::f64); // This checks to see if we can use FMOV instructions to materialize // a constant, otherwise we have to materialize via the constant pool. if (TLI.isFPImmLegal(Val, VT)) { int Imm = Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val); assert((Imm != -1) && "Cannot encode floating-point constant."); unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi; return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } // For the MachO large code model materialize the FP constant in code. if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm; const TargetRegisterClass *RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; unsigned TmpReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg) .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue()); unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(TmpReg, getKillRegState(true)); return ResultReg; } // Materialize via constant pool. MachineConstantPool wants an explicit // alignment. unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); if (Align == 0) Align = DL.getTypeAllocSize(CFP->getType()); unsigned CPI = MCP.getConstantPoolIndex(cast(CFP), Align); unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE); unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(ADRPReg) .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); return ResultReg; } unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { // We can't handle thread-local variables quickly yet. if (GV->isThreadLocal()) return 0; // MachO still uses GOT for large code-model accesses, but ELF requires // movz/movk sequences, which FastISel doesn't handle yet. 
if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO()) return 0; unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); EVT DestEVT = TLI.getValueType(DL, GV->getType(), true); if (!DestEVT.isSimple()) return 0; unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); unsigned ResultReg; if (OpFlags & AArch64II::MO_GOT) { // ADRP + LDRX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); ResultReg = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | - AArch64II::MO_NC); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags); } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) - .addImm(0); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags) + .addImm(0); } return ResultReg; } unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) { EVT CEVT = TLI.getValueType(DL, C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); if (const auto *CI = dyn_cast(C)) return materializeInt(CI, VT); else if (const ConstantFP *CFP = dyn_cast(C)) return materializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast(C)) return materializeGV(GV); return 0; } unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) { assert(CFP->isNullValue() && "Floating-point constant is not a positive zero."); MVT VT; if (!isTypeLegal(CFP->getType(), VT)) return 0; if (VT != MVT::f32 && VT != MVT::f64) return 0; bool Is64Bit = (VT == MVT::f64); unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR; unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr; return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true); } /// \brief Check if the multiply is by a power-of-2 constant. static bool isMulPowOf2(const Value *I) { if (const auto *MI = dyn_cast(I)) { if (const auto *C = dyn_cast(MI->getOperand(0))) if (C->getValue().isPowerOf2()) return true; if (const auto *C = dyn_cast(MI->getOperand(1))) if (C->getValue().isPowerOf2()) return true; } return false; } // Computes the address to get to an object. bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) { const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(Obj)) { // Don't walk into other basic blocks unless the object is an alloca from // another block, otherwise it may not have a virtual register assigned. 
if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
      Opcode = I->getOpcode();
      U = I;
    }
  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
    Opcode = C->getOpcode();
    U = C;
  }

  if (auto *Ty = dyn_cast<PointerType>(Obj->getType()))
    if (Ty->getAddressSpace() > 255)
      // Fast instruction selection doesn't support the special
      // address spaces.
      return false;

  switch (Opcode) {
  default:
    break;
  case Instruction::BitCast:
    // Look through bitcasts.
    return computeAddress(U->getOperand(0), Addr, Ty);

  case Instruction::IntToPtr:
    // Look past no-op inttoptrs.
    if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
        TLI.getPointerTy(DL))
      return computeAddress(U->getOperand(0), Addr, Ty);
    break;

  case Instruction::PtrToInt:
    // Look past no-op ptrtoints.
    if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
      return computeAddress(U->getOperand(0), Addr, Ty);
    break;

  case Instruction::GetElementPtr: {
    Address SavedAddr = Addr;
    uint64_t TmpOffset = Addr.getOffset();

    // Iterate through the GEP folding the constants into offsets where
    // we can.
    for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U);
         GTI != E; ++GTI) {
      const Value *Op = GTI.getOperand();
      if (StructType *STy = GTI.getStructTypeOrNull()) {
        const StructLayout *SL = DL.getStructLayout(STy);
        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
        TmpOffset += SL->getElementOffset(Idx);
      } else {
        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
        while (true) {
          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
            // Constant-offset addressing.
            TmpOffset += CI->getSExtValue() * S;
            break;
          }
          if (canFoldAddIntoGEP(U, Op)) {
            // A compatible add with a constant operand. Fold the constant.
            ConstantInt *CI =
                cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
            TmpOffset += CI->getSExtValue() * S;
            // Iterate on the other operand.
            Op = cast<AddOperator>(Op)->getOperand(0);
            continue;
          }
          // Unsupported
          goto unsupported_gep;
        }
      }
    }

    // Try to grab the base operand now.
    Addr.setOffset(TmpOffset);
    if (computeAddress(U->getOperand(0), Addr, Ty))
      return true;

    // We failed, restore everything and try the other options.
    Addr = SavedAddr;

  unsupported_gep:
    break;
  }
  case Instruction::Alloca: {
    const AllocaInst *AI = cast<AllocaInst>(Obj);
    DenseMap<const AllocaInst *, int>::iterator SI =
        FuncInfo.StaticAllocaMap.find(AI);
    if (SI != FuncInfo.StaticAllocaMap.end()) {
      Addr.setKind(Address::FrameIndexBase);
      Addr.setFI(SI->second);
      return true;
    }
    break;
  }
  case Instruction::Add: {
    // Adds of constants are common and easy enough.
    const Value *LHS = U->getOperand(0);
    const Value *RHS = U->getOperand(1);

    if (isa<ConstantInt>(LHS))
      std::swap(LHS, RHS);

    if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
      Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
      return computeAddress(LHS, Addr, Ty);
    }

    Address Backup = Addr;
    if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
      return true;
    Addr = Backup;

    break;
  }
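  // Worked example of the GEP folding above (illustrative): for
  // \code
  //   %p = getelementptr { i32, i32 }, { i32, i32 }* %s, i64 2, i32 1
  // \endcode
  // the struct stride is 8 bytes and field 1 sits at offset 4, so
  // TmpOffset = 2 * 8 + 4 = 20 and the whole address folds into Addr's
  // immediate offset with no extra add instructions.

  case Instruction::Sub: {
    // Subs of constants are common and easy enough.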
const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); if (const ConstantInt *CI = dyn_cast(RHS)) { Addr.setOffset(Addr.getOffset() - CI->getSExtValue()); return computeAddress(LHS, Addr, Ty); } break; } case Instruction::Shl: { if (Addr.getOffsetReg()) break; const auto *CI = dyn_cast(U->getOperand(1)); if (!CI) break; unsigned Val = CI->getZExtValue(); if (Val < 1 || Val > 3) break; uint64_t NumBytes = 0; if (Ty && Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (NumBytes != (1ULL << Val)) break; Addr.setShift(Val); Addr.setExtendType(AArch64_AM::LSL); const Value *Src = U->getOperand(0); if (const auto *I = dyn_cast(Src)) { if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { // Fold the zext or sext when it won't become a noop. if (const auto *ZE = dyn_cast(I)) { if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::UXTW); Src = ZE->getOperand(0); } } else if (const auto *SE = dyn_cast(I)) { if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::SXTW); Src = SE->getOperand(0); } } } } if (const auto *AI = dyn_cast(Src)) if (AI->getOpcode() == Instruction::And) { const Value *LHS = AI->getOperand(0); const Value *RHS = AI->getOperand(1); if (const auto *C = dyn_cast(LHS)) if (C->getValue() == 0xffffffff) std::swap(LHS, RHS); if (const auto *C = dyn_cast(RHS)) if (C->getValue() == 0xffffffff) { Addr.setExtendType(AArch64_AM::UXTW); unsigned Reg = getRegForValue(LHS); if (!Reg) return false; bool RegIsKill = hasTrivialKill(LHS); Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, AArch64::sub_32); Addr.setOffsetReg(Reg); return true; } } unsigned Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); return true; } case Instruction::Mul: { if (Addr.getOffsetReg()) break; if (!isMulPowOf2(U)) break; const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); // Canonicalize power-of-2 value to the RHS. if (const auto *C = dyn_cast(LHS)) if (C->getValue().isPowerOf2()) std::swap(LHS, RHS); assert(isa(RHS) && "Expected an ConstantInt."); const auto *C = cast(RHS); unsigned Val = C->getValue().logBase2(); if (Val < 1 || Val > 3) break; uint64_t NumBytes = 0; if (Ty && Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (NumBytes != (1ULL << Val)) break; Addr.setShift(Val); Addr.setExtendType(AArch64_AM::LSL); const Value *Src = LHS; if (const auto *I = dyn_cast(Src)) { if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { // Fold the zext or sext when it won't become a noop. 
if (const auto *ZE = dyn_cast(I)) { if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::UXTW); Src = ZE->getOperand(0); } } else if (const auto *SE = dyn_cast(I)) { if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::SXTW); Src = SE->getOperand(0); } } } } unsigned Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); return true; } case Instruction::And: { if (Addr.getOffsetReg()) break; if (!Ty || DL.getTypeSizeInBits(Ty) != 8) break; const Value *LHS = U->getOperand(0); const Value *RHS = U->getOperand(1); if (const auto *C = dyn_cast(LHS)) if (C->getValue() == 0xffffffff) std::swap(LHS, RHS); if (const auto *C = dyn_cast(RHS)) if (C->getValue() == 0xffffffff) { Addr.setShift(0); Addr.setExtendType(AArch64_AM::LSL); Addr.setExtendType(AArch64_AM::UXTW); unsigned Reg = getRegForValue(LHS); if (!Reg) return false; bool RegIsKill = hasTrivialKill(LHS); Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill, AArch64::sub_32); Addr.setOffsetReg(Reg); return true; } break; } case Instruction::SExt: case Instruction::ZExt: { if (!Addr.getReg() || Addr.getOffsetReg()) break; const Value *Src = nullptr; // Fold the zext or sext when it won't become a noop. if (const auto *ZE = dyn_cast(U)) { if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::UXTW); Src = ZE->getOperand(0); } } else if (const auto *SE = dyn_cast(U)) { if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) { Addr.setExtendType(AArch64_AM::SXTW); Src = SE->getOperand(0); } } if (!Src) break; Addr.setShift(0); unsigned Reg = getRegForValue(Src); if (!Reg) return false; Addr.setOffsetReg(Reg); return true; } } // end switch if (Addr.isRegBase() && !Addr.getReg()) { unsigned Reg = getRegForValue(Obj); if (!Reg) return false; Addr.setReg(Reg); return true; } if (!Addr.getOffsetReg()) { unsigned Reg = getRegForValue(Obj); if (!Reg) return false; Addr.setOffsetReg(Reg); return true; } return false; } bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) { const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; bool InMBB = true; if (const auto *I = dyn_cast(V)) { Opcode = I->getOpcode(); U = I; InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); } else if (const auto *C = dyn_cast(V)) { Opcode = C->getOpcode(); U = C; } switch (Opcode) { default: break; case Instruction::BitCast: // Look past bitcasts if its operand is in the same BB. if (InMBB) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && TLI.getValueType(DL, U->getOperand(0)->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. if (InMBB && TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL)) return computeCallAddress(U->getOperand(0), Addr); break; } if (const GlobalValue *GV = dyn_cast(V)) { Addr.setGlobalValue(GV); return true; } // If all else fails, try to materialize the value in a register. if (!Addr.getGlobalValue()) { Addr.setReg(getRegForValue(V)); return Addr.getReg() != 0; } return false; } bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); // Only handle simple types. 
if (evt == MVT::Other || !evt.isSimple())
    return false;
  VT = evt.getSimpleVT();

  // This is a legal type, but it's not something we handle in fast-isel.
  if (VT == MVT::f128)
    return false;

  // Handle all other legal types, i.e. a register that will directly hold
  // this value.
  return TLI.isTypeLegal(VT);
}

/// \brief Determine if the value type is supported by FastISel.
///
/// FastISel for AArch64 can handle more value types than are legal. This adds
/// simple value types such as i1, i8, and i16.
bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
  if (Ty->isVectorTy() && !IsVectorAllowed)
    return false;

  if (isTypeLegal(Ty, VT))
    return true;

  // If this is a type that can be sign- or zero-extended to a basic operation
  // go ahead and accept it now.
  if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
    return true;

  return false;
}

bool AArch64FastISel::isValueAvailable(const Value *V) const {
  if (!isa<Instruction>(V))
    return true;

  const auto *I = cast<Instruction>(V);
  return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB;
}

bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
  unsigned ScaleFactor = getImplicitScaleFactor(VT);
  if (!ScaleFactor)
    return false;

  bool ImmediateOffsetNeedsLowering = false;
  bool RegisterOffsetNeedsLowering = false;
  int64_t Offset = Addr.getOffset();
  if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset))
    ImmediateOffsetNeedsLowering = true;
  else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) &&
           !isUInt<12>(Offset / ScaleFactor))
    ImmediateOffsetNeedsLowering = true;

  // Cannot encode an offset register and an immediate offset in the same
  // instruction. Fold the immediate offset into the load/store instruction and
  // emit an additional add to take care of the offset register.
  if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
    RegisterOffsetNeedsLowering = true;

  // Cannot encode zero register as base.
  if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg())
    RegisterOffsetNeedsLowering = true;
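  // Concrete encoding limits checked above (illustrative, i64 case with
  // ScaleFactor = 8):
  // \code
  //   ldr  x0, [x1, #32]   ; 32 is a multiple of 8 and 32/8 < 2^12: scaled form
  //   ldur x0, [x1, #-8]   ; negative offset: unscaled signed 9-bit form
  //   ; an offset such as 65536*8 fits neither form and is first lowered
  //   ; into a register
  // \endcode

  // If this is a stack pointer and the offset needs to be simplified then put
  // the alloca address into a register, set the base type back to register and
  // continue. This should almost never happen.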
if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) { unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), ResultReg) .addFrameIndex(Addr.getFI()) .addImm(0) .addImm(0); Addr.setKind(Address::RegBase); Addr.setReg(ResultReg); } if (RegisterOffsetNeedsLowering) { unsigned ResultReg = 0; if (Addr.getReg()) { if (Addr.getExtendType() == AArch64_AM::SXTW || Addr.getExtendType() == AArch64_AM::UXTW ) ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(), /*TODO:IsKill=*/false, Addr.getOffsetReg(), /*TODO:IsKill=*/false, Addr.getExtendType(), Addr.getShift()); else ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(), /*TODO:IsKill=*/false, Addr.getOffsetReg(), /*TODO:IsKill=*/false, AArch64_AM::LSL, Addr.getShift()); } else { if (Addr.getExtendType() == AArch64_AM::UXTW) ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), /*Op0IsKill=*/false, Addr.getShift(), /*IsZExt=*/true); else if (Addr.getExtendType() == AArch64_AM::SXTW) ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(), /*Op0IsKill=*/false, Addr.getShift(), /*IsZExt=*/false); else ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(), /*Op0IsKill=*/false, Addr.getShift()); } if (!ResultReg) return false; Addr.setReg(ResultReg); Addr.setOffsetReg(0); Addr.setShift(0); Addr.setExtendType(AArch64_AM::InvalidShiftExtend); } // Since the offset is too large for the load/store instruction get the // reg+offset into a register. if (ImmediateOffsetNeedsLowering) { unsigned ResultReg; if (Addr.getReg()) // Try to fold the immediate into the add instruction. ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset); else ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset); if (!ResultReg) return false; Addr.setReg(ResultReg); Addr.setOffset(0); } return true; } void AArch64FastISel::addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, MachineMemOperand::Flags Flags, unsigned ScaleFactor, MachineMemOperand *MMO) { int64_t Offset = Addr.getOffset() / ScaleFactor; // Frame base works a bit differently. Handle it separately. if (Addr.isFIBase()) { int FI = Addr.getFI(); // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { assert(Addr.isRegBase() && "Unexpected address kind."); const MCInstrDesc &II = MIB->getDesc(); unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 
1 : 0; Addr.setReg( constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx)); Addr.setOffsetReg( constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1)); if (Addr.getOffsetReg()) { assert(Addr.getOffset() == 0 && "Unexpected offset"); bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW || Addr.getExtendType() == AArch64_AM::SXTX; MIB.addReg(Addr.getReg()); MIB.addReg(Addr.getOffsetReg()); MIB.addImm(IsSigned); MIB.addImm(Addr.getShift() != 0); } else MIB.addReg(Addr.getReg()).addImm(Offset); if (MMO) MIB.addMemOperand(MMO); } unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags, bool WantResult, bool IsZExt) { AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend; bool NeedExtend = false; switch (RetVT.SimpleTy) { default: return 0; case MVT::i1: NeedExtend = true; break; case MVT::i8: NeedExtend = true; ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB; break; case MVT::i16: NeedExtend = true; ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH; break; case MVT::i32: // fall-through case MVT::i64: break; } MVT SrcVT = RetVT; RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32); // Canonicalize immediates to the RHS first. if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS)) std::swap(LHS, RHS); // Canonicalize mul by power of 2 to the RHS. if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) if (isMulPowOf2(LHS)) std::swap(LHS, RHS); // Canonicalize shift immediate to the RHS. if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS)) if (const auto *SI = dyn_cast<BinaryOperator>(LHS)) if (isa<ConstantInt>(SI->getOperand(1))) if (SI->getOpcode() == Instruction::Shl || SI->getOpcode() == Instruction::LShr || SI->getOpcode() == Instruction::AShr ) std::swap(LHS, RHS); unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; bool LHSIsKill = hasTrivialKill(LHS); if (NeedExtend) LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt); unsigned ResultReg = 0; if (const auto *C = dyn_cast<ConstantInt>(RHS)) { uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue(); if (C->isNegative()) ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm, SetFlags, WantResult); else ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags, WantResult); } else if (const auto *C = dyn_cast<Constant>(RHS)) if (C->isNullValue()) ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags, WantResult); if (ResultReg) return ResultReg; // Only extend the RHS within the instruction if there is a valid extend type. if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ExtendType, C->getZExtValue(), SetFlags, WantResult); } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(RHS); return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ExtendType, 0, SetFlags, WantResult); } // Check if the mul can be folded into the instruction.
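// Illustrative example (not in the original source): "add i64 %a, (mul i64
// %b, 8)" matches this fold and becomes a single shifted-register add,
// "add x0, x1, x2, lsl #3", since a multiply by a power of two is a shift.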
if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) if (C->getValue().isPowerOf2()) std::swap(MulLHS, MulRHS); assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); unsigned RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); if (ResultReg) return ResultReg; } } // Check if the shift can be folded into the instruction. if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; switch (SI->getOpcode()) { default: break; case Instruction::Shl: ShiftType = AArch64_AM::LSL; break; case Instruction::LShr: ShiftType = AArch64_AM::LSR; break; case Instruction::AShr: ShiftType = AArch64_AM::ASR; break; } uint64_t ShiftVal = C->getZExtValue(); if (ShiftType != AArch64_AM::InvalidShiftExtend) { unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ShiftType, ShiftVal, SetFlags, WantResult); if (ResultReg) return ResultReg; } } } } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(RHS); if (NeedExtend) RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt); return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, SetFlags, WantResult); } unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); if (LHSReg == AArch64::SP || LHSReg == AArch64::WSP || RHSReg == AArch64::SP || RHSReg == AArch64::WSP) return 0; if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrr, AArch64::SUBXrr }, { AArch64::ADDWrr, AArch64::ADDXrr } }, { { AArch64::SUBSWrr, AArch64::SUBSXrr }, { AArch64::ADDSWrr, AArch64::ADDSXrr } } }; bool Is64Bit = RetVT == MVT::i64; unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; const TargetRegisterClass *RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; unsigned ResultReg; if (WantResult) ResultReg = createResultReg(RC); else ResultReg = Is64Bit ?
AArch64::XZR : AArch64::WZR; const MCInstrDesc &II = TII.get(Opc); LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)) .addReg(RHSReg, getKillRegState(RHSIsKill)); return ResultReg; } unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm, bool SetFlags, bool WantResult) { assert(LHSReg && "Invalid register number."); if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; unsigned ShiftImm; if (isUInt<12>(Imm)) ShiftImm = 0; else if ((Imm & 0xfff000) == Imm) { ShiftImm = 12; Imm >>= 12; } else return 0; static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWri, AArch64::SUBXri }, { AArch64::ADDWri, AArch64::ADDXri } }, { { AArch64::SUBSWri, AArch64::SUBSXri }, { AArch64::ADDSWri, AArch64::ADDSXri } } }; bool Is64Bit = RetVT == MVT::i64; unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; const TargetRegisterClass *RC; if (SetFlags) RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; else RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; unsigned ResultReg; if (WantResult) ResultReg = createResultReg(RC); else ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; const MCInstrDesc &II = TII.get(Opc); LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)) .addImm(Imm) .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm)); return ResultReg; } unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); assert(LHSReg != AArch64::SP && LHSReg != AArch64::WSP && RHSReg != AArch64::SP && RHSReg != AArch64::WSP); if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; // Don't deal with undefined shifts. if (ShiftImm >= RetVT.getSizeInBits()) return 0; static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrs, AArch64::SUBXrs }, { AArch64::ADDWrs, AArch64::ADDXrs } }, { { AArch64::SUBSWrs, AArch64::SUBSXrs }, { AArch64::ADDSWrs, AArch64::ADDSXrs } } }; bool Is64Bit = RetVT == MVT::i64; unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; const TargetRegisterClass *RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; unsigned ResultReg; if (WantResult) ResultReg = createResultReg(RC); else ResultReg = Is64Bit ? 
AArch64::XZR : AArch64::WZR; const MCInstrDesc &II = TII.get(Opc); LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)) .addReg(RHSReg, getKillRegState(RHSIsKill)) .addImm(getShifterImm(ShiftType, ShiftImm)); return ResultReg; } unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ExtType, uint64_t ShiftImm, bool SetFlags, bool WantResult) { assert(LHSReg && RHSReg && "Invalid register number."); assert(LHSReg != AArch64::XZR && LHSReg != AArch64::WZR && RHSReg != AArch64::XZR && RHSReg != AArch64::WZR); if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; if (ShiftImm >= 4) return 0; static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrx, AArch64::SUBXrx }, { AArch64::ADDWrx, AArch64::ADDXrx } }, { { AArch64::SUBSWrx, AArch64::SUBSXrx }, { AArch64::ADDSWrx, AArch64::ADDSXrx } } }; bool Is64Bit = RetVT == MVT::i64; unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit]; const TargetRegisterClass *RC = nullptr; if (SetFlags) RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass; else RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass; unsigned ResultReg; if (WantResult) ResultReg = createResultReg(RC); else ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR; const MCInstrDesc &II = TII.get(Opc); LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs()); RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)) .addReg(RHSReg, getKillRegState(RHSIsKill)) .addImm(getArithExtendImm(ExtType, ShiftImm)); return ResultReg; } bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) { Type *Ty = LHS->getType(); EVT EVT = TLI.getValueType(DL, Ty, true); if (!EVT.isSimple()) return false; MVT VT = EVT.getSimpleVT(); switch (VT.SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: return emitICmp(VT, LHS, RHS, IsZExt); case MVT::f32: case MVT::f64: return emitFCmp(VT, LHS, RHS); } } bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt) { return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false, IsZExt) != 0; } bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm, /*SetFlags=*/true, /*WantResult=*/false) != 0; } bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) { if (RetVT != MVT::f32 && RetVT != MVT::f64) return false; // Check to see if the 2nd operand is a constant that we can encode directly // in the compare. bool UseImm = false; if (const auto *CFP = dyn_cast<ConstantFP>(RHS)) if (CFP->isZero() && !CFP->isNegative()) UseImm = true; unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return false; bool LHSIsKill = hasTrivialKill(LHS); if (UseImm) { unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(LHSReg, getKillRegState(LHSIsKill)); return true; } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return false; bool RHSIsKill = hasTrivialKill(RHS); unsigned Opc = (RetVT == MVT::f64) ?
AArch64::FCMPDrr : AArch64::FCMPSrr; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(LHSReg, getKillRegState(LHSIsKill)) .addReg(RHSReg, getKillRegState(RHSIsKill)); return true; } unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags, bool WantResult, bool IsZExt) { return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt); } /// \brief This method is a wrapper to simplify add emission. /// /// First try to emit an add with an immediate operand using emitAddSub_ri. If /// that fails, then try to materialize the immediate into a register and use /// emitAddSub_rr instead. unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm) { unsigned ResultReg; if (Imm < 0) ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm); else ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm); if (ResultReg) return ResultReg; unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm); if (!CReg) return 0; ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true); return ResultReg; } unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS, bool SetFlags, bool WantResult, bool IsZExt) { return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult, IsZExt); } unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, bool WantResult) { return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, /*SetFlags=*/true, WantResult); } unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm, bool WantResult) { return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true, WantResult); } unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS, const Value *RHS) { // Canonicalize immediates to the RHS first. if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS)) std::swap(LHS, RHS); // Canonicalize mul by power-of-2 to the RHS. if (LHS->hasOneUse() && isValueAvailable(LHS)) if (isMulPowOf2(LHS)) std::swap(LHS, RHS); // Canonicalize shift immediate to the RHS. if (LHS->hasOneUse() && isValueAvailable(LHS)) if (const auto *SI = dyn_cast<ShlOperator>(LHS)) if (isa<ConstantInt>(SI->getOperand(1))) std::swap(LHS, RHS); unsigned LHSReg = getRegForValue(LHS); if (!LHSReg) return 0; bool LHSIsKill = hasTrivialKill(LHS); unsigned ResultReg = 0; if (const auto *C = dyn_cast<ConstantInt>(RHS)) { uint64_t Imm = C->getZExtValue(); ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm); } if (ResultReg) return ResultReg; // Check if the mul can be folded into the instruction. if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); if (const auto *C = dyn_cast<ConstantInt>(MulLHS)) if (C->getValue().isPowerOf2()) std::swap(MulLHS, MulRHS); assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt."); uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2(); unsigned RHSReg = getRegForValue(MulLHS); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ShiftVal); if (ResultReg) return ResultReg; } } // Check if the shift can be folded into the instruction.
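// Illustrative example (not in the original source): "or i32 %a, (shl i32
// %b, 4)" is matched below and emitted as one shifted-register logical
// instruction, "orr w0, w1, w2, lsl #4".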
if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); unsigned RHSReg = getRegForValue(SI->getOperand(0)); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, ShiftVal); if (ResultReg) return ResultReg; } } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(RHS); MVT VT = std::max(MVT::i32, RetVT.SimpleTy); ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); } return ResultReg; } unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWri, AArch64::ANDXri }, { AArch64::ORRWri, AArch64::ORRXri }, { AArch64::EORWri, AArch64::EORXri } }; const TargetRegisterClass *RC; unsigned Opc; unsigned RegSize; switch (RetVT.SimpleTy) { default: return 0; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: { unsigned Idx = ISDOpc - ISD::AND; Opc = OpcTable[Idx][0]; RC = &AArch64::GPR32spRegClass; RegSize = 32; break; } case MVT::i64: Opc = OpcTable[ISDOpc - ISD::AND][1]; RC = &AArch64::GPR64spRegClass; RegSize = 64; break; } if (!AArch64_AM::isLogicalImmediate(Imm, RegSize)) return 0; unsigned ResultReg = fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill, AArch64_AM::encodeLogicalImmediate(Imm, RegSize)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) { uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); } return ResultReg; } unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, uint64_t ShiftImm) { static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWrs, AArch64::ANDXrs }, { AArch64::ORRWrs, AArch64::ORRXrs }, { AArch64::EORWrs, AArch64::EORXrs } }; // Don't deal with undefined shifts. if (ShiftImm >= RetVT.getSizeInBits()) return 0; const TargetRegisterClass *RC; unsigned Opc; switch (RetVT.SimpleTy) { default: return 0; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: Opc = OpcTable[ISDOpc - ISD::AND][0]; RC = &AArch64::GPR32RegClass; break; case MVT::i64: Opc = OpcTable[ISDOpc - ISD::AND][1]; RC = &AArch64::GPR64RegClass; break; } unsigned ResultReg = fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill, AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm)); if (RetVT >= MVT::i8 && RetVT <= MVT::i16) { uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff; ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask); } return ResultReg; } unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm); } unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr, bool WantZExt, MachineMemOperand *MMO) { if (!TLI.allowsMisalignedMemoryAccesses(VT)) return 0; // Simplify this down to something we can handle.
if (!simplifyAddress(Addr, VT)) return 0; unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. bool UseScaled = true; if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { UseScaled = false; ScaleFactor = 1; } static const unsigned GPOpcTable[2][8][4] = { // Sign-extend. { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi, AArch64::LDURXi }, { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi, AArch64::LDURXi }, { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui, AArch64::LDRXui }, { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui, AArch64::LDRXui }, { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX, AArch64::LDRXroX }, { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX, AArch64::LDRXroX }, { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW, AArch64::LDRXroW }, { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW, AArch64::LDRXroW } }, // Zero-extend. { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, AArch64::LDURXi }, { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi, AArch64::LDURXi }, { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, AArch64::LDRXui }, { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui, AArch64::LDRXui }, { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, AArch64::LDRXroX }, { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX, AArch64::LDRXroX }, { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, AArch64::LDRXroW }, { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW, AArch64::LDRXroW } } }; static const unsigned FPOpcTable[4][2] = { { AArch64::LDURSi, AArch64::LDURDi }, { AArch64::LDRSui, AArch64::LDRDui }, { AArch64::LDRSroX, AArch64::LDRDroX }, { AArch64::LDRSroW, AArch64::LDRDroW } }; unsigned Opc; const TargetRegisterClass *RC; bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && Addr.getOffsetReg(); unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; if (Addr.getExtendType() == AArch64_AM::UXTW || Addr.getExtendType() == AArch64_AM::SXTW) Idx++; bool IsRet64Bit = RetVT == MVT::i64; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); case MVT::i1: // Intentional fall-through. case MVT::i8: Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0]; RC = (IsRet64Bit && !WantZExt) ? &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i16: Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1]; RC = (IsRet64Bit && !WantZExt) ? &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i32: Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2]; RC = (IsRet64Bit && !WantZExt) ? &AArch64::GPR64RegClass: &AArch64::GPR32RegClass; break; case MVT::i64: Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3]; RC = &AArch64::GPR64RegClass; break; case MVT::f32: Opc = FPOpcTable[Idx][0]; RC = &AArch64::FPR32RegClass; break; case MVT::f64: Opc = FPOpcTable[Idx][1]; RC = &AArch64::FPR64RegClass; break; } // Create the base instruction, then add the operands. unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO); // Loading an i1 requires special handling. 
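// Only the low bit of an i1 is meaningful and the in-memory byte may carry
// stale high bits, so the loaded value is masked down to one bit; e.g.
// (illustration, not in the original source) "ldrb w0, [x1]" followed by
// "and w0, w0, #0x1".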
if (VT == MVT::i1) { unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); ResultReg = ANDReg; } // For zero-extending loads to 64bit we emit a 32bit load and then convert // the 32bit reg to a 64bit reg. if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) { unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBREG_TO_REG), Reg64) .addImm(0) .addReg(ResultReg, getKillRegState(true)) .addImm(AArch64::sub_32); ResultReg = Reg64; } return ResultReg; } bool AArch64FastISel::selectAddSub(const Instruction *I) { MVT VT; if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) return false; if (VT.isVector()) return selectOperator(I, I->getOpcode()); unsigned ResultReg; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected instruction."); case Instruction::Add: ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1)); break; case Instruction::Sub: ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1)); break; } if (!ResultReg) return false; updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::selectLogicalOp(const Instruction *I) { MVT VT; if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true)) return false; if (VT.isVector()) return selectOperator(I, I->getOpcode()); unsigned ResultReg; switch (I->getOpcode()) { default: llvm_unreachable("Unexpected instruction."); case Instruction::And: ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1)); break; case Instruction::Or: ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1)); break; case Instruction::Xor: ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1)); break; } if (!ResultReg) return false; updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::selectLoad(const Instruction *I) { MVT VT; // Verify we have a legal type before going any further. Currently, we handle // simple types that will directly fit in a register (i32/f32/i64/f64) or // those that can be sign or zero-extended to a basic operation (i1/i8/i16). if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) || cast<LoadInst>(I)->isAtomic()) return false; const Value *SV = I->getOperand(0); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. if (const Argument *Arg = dyn_cast<Argument>(SV)) { if (Arg->hasSwiftErrorAttr()) return false; } if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) { if (Alloca->isSwiftError()) return false; } } // See if we can handle this address. Address Addr; if (!computeAddress(I->getOperand(0), Addr, I->getType())) return false; // Fold the following sign-/zero-extend into the load instruction. bool WantZExt = true; MVT RetVT = VT; const Value *IntExtVal = nullptr; if (I->hasOneUse()) { if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) { if (isTypeSupported(ZE->getType(), RetVT)) IntExtVal = ZE; else RetVT = VT; } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) { if (isTypeSupported(SE->getType(), RetVT)) IntExtVal = SE; else RetVT = VT; WantZExt = false; } } unsigned ResultReg = emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I)); if (!ResultReg) return false; // There are a few different cases we have to handle, because the load or the // sign-/zero-extend might not be selected by FastISel if we fall back to // SelectionDAG.
There is also an ordering issue when both instructions are in // different basic blocks. // 1.) The load instruction is selected by FastISel, but the integer extend // not. This usually happens when the integer extend is in a different // basic block and SelectionDAG took over for that basic block. // 2.) The load instruction is selected before the integer extend. This only // happens when the integer extend is in a different basic block. // 3.) The load instruction is selected by SelectionDAG and the integer extend // by FastISel. This happens if there are instructions between the load // and the integer extend that couldn't be selected by FastISel. if (IntExtVal) { // The integer extend hasn't been emitted yet. FastISel or SelectionDAG // could select it. Emit a copy to subreg if necessary. FastISel will remove // it when it selects the integer extend. unsigned Reg = lookUpRegForValue(IntExtVal); auto *MI = MRI.getUniqueVRegDef(Reg); if (!MI) { if (RetVT == MVT::i64 && VT <= MVT::i32) { if (WantZExt) { // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG). std::prev(FuncInfo.InsertPt)->eraseFromParent(); ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg(); } else ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg, /*IsKill=*/true, AArch64::sub_32); } updateValueMap(I, ResultReg); return true; } // The integer extend has already been emitted - delete all the instructions // that have been emitted by the integer extend lowering code and use the // result from the load instruction directly. while (MI) { Reg = 0; for (auto &Opnd : MI->uses()) { if (Opnd.isReg()) { Reg = Opnd.getReg(); break; } } MI->eraseFromParent(); MI = nullptr; if (Reg) MI = MRI.getUniqueVRegDef(Reg); } updateValueMap(IntExtVal, ResultReg); return true; } updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::emitStoreRelease(MVT VT, unsigned SrcReg, unsigned AddrReg, MachineMemOperand *MMO) { unsigned Opc; switch (VT.SimpleTy) { default: return false; case MVT::i8: Opc = AArch64::STLRB; break; case MVT::i16: Opc = AArch64::STLRH; break; case MVT::i32: Opc = AArch64::STLRW; break; case MVT::i64: Opc = AArch64::STLRX; break; } const MCInstrDesc &II = TII.get(Opc); SrcReg = constrainOperandRegClass(II, SrcReg, 0); AddrReg = constrainOperandRegClass(II, AddrReg, 1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(SrcReg) .addReg(AddrReg) .addMemOperand(MMO); return true; } bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, MachineMemOperand *MMO) { if (!TLI.allowsMisalignedMemoryAccesses(VT)) return false; // Simplify this down to something we can handle. if (!simplifyAddress(Addr, VT)) return false; unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) llvm_unreachable("Unexpected value type."); // Negative offsets require unscaled, 9-bit, signed immediate offsets. // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. 
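// For example (illustration, not in the original source): an i64 store at
// offset 16 can use the scaled form "str x0, [x1, #16]" (a 12-bit unsigned
// multiple of 8), while offset -8 must fall back to the unscaled form
// "stur x0, [x1, #-8]" (a 9-bit signed byte offset, -256..255).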
bool UseScaled = true; if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) { UseScaled = false; ScaleFactor = 1; } static const unsigned OpcTable[4][6] = { { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi, AArch64::STURSi, AArch64::STURDi }, { AArch64::STRBBui, AArch64::STRHHui, AArch64::STRWui, AArch64::STRXui, AArch64::STRSui, AArch64::STRDui }, { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX, AArch64::STRSroX, AArch64::STRDroX }, { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW, AArch64::STRSroW, AArch64::STRDroW } }; unsigned Opc; bool VTIsi1 = false; bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() && Addr.getOffsetReg(); unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0; if (Addr.getExtendType() == AArch64_AM::UXTW || Addr.getExtendType() == AArch64_AM::SXTW) Idx++; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); case MVT::i1: VTIsi1 = true; LLVM_FALLTHROUGH; case MVT::i8: Opc = OpcTable[Idx][0]; break; case MVT::i16: Opc = OpcTable[Idx][1]; break; case MVT::i32: Opc = OpcTable[Idx][2]; break; case MVT::i64: Opc = OpcTable[Idx][3]; break; case MVT::f32: Opc = OpcTable[Idx][4]; break; case MVT::f64: Opc = OpcTable[Idx][5]; break; } // Storing an i1 requires special handling. if (VTIsi1 && SrcReg != AArch64::WZR) { unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1); assert(ANDReg && "Unexpected AND instruction emission failure."); SrcReg = ANDReg; } // Create the base instruction, then add the operands. const MCInstrDesc &II = TII.get(Opc); SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg); addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO); return true; } bool AArch64FastISel::selectStore(const Instruction *I) { MVT VT; const Value *Op0 = I->getOperand(0); // Verify we have a legal type before going any further. Currently, we handle // simple types that will directly fit in a register (i32/f32/i64/f64) or // those that can be sign or zero-extended to a basic operation (i1/i8/i16). if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true)) return false; const Value *PtrV = I->getOperand(1); if (TLI.supportSwiftError()) { // Swifterror values can come from either a function parameter with // swifterror attribute or an alloca with swifterror attribute. if (const Argument *Arg = dyn_cast<Argument>(PtrV)) { if (Arg->hasSwiftErrorAttr()) return false; } if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) { if (Alloca->isSwiftError()) return false; } } // Get the value to be stored into a register. Use the zero register directly // when possible to avoid an unnecessary copy and a wasted register. unsigned SrcReg = 0; if (const auto *CI = dyn_cast<ConstantInt>(Op0)) { if (CI->isZero()) SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) { if (CF->isZero() && !CF->isNegative()) { VT = MVT::getIntegerVT(VT.getSizeInBits()); SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR; } } if (!SrcReg) SrcReg = getRegForValue(Op0); if (!SrcReg) return false; auto *SI = cast<StoreInst>(I); // Try to emit a STLR for seq_cst/release. if (SI->isAtomic()) { AtomicOrdering Ord = SI->getOrdering(); // The non-atomic instructions are sufficient for relaxed stores. if (isReleaseOrStronger(Ord)) { // The STLR addressing mode only supports a base reg; pass that directly.
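// E.g. (illustration, not in the original source): "store atomic i32 %v,
// i32* %p release" becomes "stlr w0, [x1]"; any offset computation has to
// be folded into that single base register beforehand.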
unsigned AddrReg = getRegForValue(PtrV); return emitStoreRelease(VT, SrcReg, AddrReg, createMachineMemOperandFor(I)); } } // See if we can handle this address. Address Addr; if (!computeAddress(PtrV, Addr, Op0->getType())) return false; if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I))) return false; return true; } static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { switch (Pred) { case CmpInst::FCMP_ONE: case CmpInst::FCMP_UEQ: default: // AL is our "false" for now. The other two need more compares. return AArch64CC::AL; case CmpInst::ICMP_EQ: case CmpInst::FCMP_OEQ: return AArch64CC::EQ; case CmpInst::ICMP_SGT: case CmpInst::FCMP_OGT: return AArch64CC::GT; case CmpInst::ICMP_SGE: case CmpInst::FCMP_OGE: return AArch64CC::GE; case CmpInst::ICMP_UGT: case CmpInst::FCMP_UGT: return AArch64CC::HI; case CmpInst::FCMP_OLT: return AArch64CC::MI; case CmpInst::ICMP_ULE: case CmpInst::FCMP_OLE: return AArch64CC::LS; case CmpInst::FCMP_ORD: return AArch64CC::VC; case CmpInst::FCMP_UNO: return AArch64CC::VS; case CmpInst::FCMP_UGE: return AArch64CC::PL; case CmpInst::ICMP_SLT: case CmpInst::FCMP_ULT: return AArch64CC::LT; case CmpInst::ICMP_SLE: case CmpInst::FCMP_ULE: return AArch64CC::LE; case CmpInst::FCMP_UNE: case CmpInst::ICMP_NE: return AArch64CC::NE; case CmpInst::ICMP_UGE: return AArch64CC::HS; case CmpInst::ICMP_ULT: return AArch64CC::LO; } } /// \brief Try to emit a combined compare-and-branch instruction. bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction"); const CmpInst *CI = cast<CmpInst>(BI->getCondition()); CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); const Value *LHS = CI->getOperand(0); const Value *RHS = CI->getOperand(1); MVT VT; if (!isTypeSupported(LHS->getType(), VT)) return false; unsigned BW = VT.getSizeInBits(); if (BW > 64) return false; MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; // Try to take advantage of fallthrough opportunities.
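// If the true block immediately follows in layout, branch on the inverted
// condition to the false block so the common path falls through; e.g.
// (illustration, not in the original source) "cbz w0, .LBB_false" instead of
// "cbnz w0, .LBB_true; b .LBB_false".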
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); Predicate = CmpInst::getInversePredicate(Predicate); } int TestBit = -1; bool IsCmpNE; switch (Predicate) { default: return false; case CmpInst::ICMP_EQ: case CmpInst::ICMP_NE: if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue()) std::swap(LHS, RHS); if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) return false; if (const auto *AI = dyn_cast<BinaryOperator>(LHS)) if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) { const Value *AndLHS = AI->getOperand(0); const Value *AndRHS = AI->getOperand(1); if (const auto *C = dyn_cast<ConstantInt>(AndLHS)) if (C->getValue().isPowerOf2()) std::swap(AndLHS, AndRHS); if (const auto *C = dyn_cast<ConstantInt>(AndRHS)) if (C->getValue().isPowerOf2()) { TestBit = C->getValue().logBase2(); LHS = AndLHS; } } if (VT == MVT::i1) TestBit = 0; IsCmpNE = Predicate == CmpInst::ICMP_NE; break; case CmpInst::ICMP_SLT: case CmpInst::ICMP_SGE: if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue()) return false; TestBit = BW - 1; IsCmpNE = Predicate == CmpInst::ICMP_SLT; break; case CmpInst::ICMP_SGT: case CmpInst::ICMP_SLE: if (!isa<ConstantInt>(RHS)) return false; if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true)) return false; TestBit = BW - 1; IsCmpNE = Predicate == CmpInst::ICMP_SLE; break; } // end switch static const unsigned OpcTable[2][2][2] = { { {AArch64::CBZW, AArch64::CBZX }, {AArch64::CBNZW, AArch64::CBNZX} }, { {AArch64::TBZW, AArch64::TBZX }, {AArch64::TBNZW, AArch64::TBNZX} } }; bool IsBitTest = TestBit != -1; bool Is64Bit = BW == 64; if (TestBit < 32 && TestBit >= 0) Is64Bit = false; unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit]; const MCInstrDesc &II = TII.get(Opc); unsigned SrcReg = getRegForValue(LHS); if (!SrcReg) return false; bool SrcIsKill = hasTrivialKill(LHS); if (BW == 64 && !Is64Bit) SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill, AArch64::sub_32); if ((BW < 32) && !IsBitTest) SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true); // Emit the combined compare and branch instruction. SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs()); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(SrcReg, getKillRegState(SrcIsKill)); if (IsBitTest) MIB.addImm(TestBit); MIB.addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); return true; } bool AArch64FastISel::selectBranch(const Instruction *I) { const BranchInst *BI = cast<BranchInst>(I); if (BI->isUnconditional()) { MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)]; fastEmitBranch(MSucc, BI->getDebugLoc()); return true; } MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: fastEmitBranch(FBB, DbgLoc); return true; case CmpInst::FCMP_TRUE: fastEmitBranch(TBB, DbgLoc); return true; } // Try to emit a combined compare-and-branch first. if (emitCompareAndBranch(BI)) return true; // Try to take advantage of fallthrough opportunities. if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); Predicate = CmpInst::getInversePredicate(Predicate); } // Emit the cmp. if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction.
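// E.g. (illustration, not in the original source): FCMP_UEQ ("unordered or
// equal") is emitted below as two conditional branches on the same flags,
// "b.vs" for the unordered case followed by "b.eq"; FCMP_ONE similarly
// needs "b.mi" plus "b.gt".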
AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: break; case CmpInst::FCMP_UEQ: ExtraCC = AArch64CC::EQ; CC = AArch64CC::VS; break; case CmpInst::FCMP_ONE: ExtraCC = AArch64CC::MI; CC = AArch64CC::GT; break; } assert((CC != AArch64CC::AL) && "Unexpected condition code."); // Emit the extra branch for FCMP_UEQ and FCMP_ONE. if (ExtraCC != AArch64CC::AL) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(ExtraCC) .addMBB(TBB); } // Emit the branch. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { uint64_t Imm = CI->getZExtValue(); MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); // Obtain the branch probability and add the target to the successor list. if (FuncInfo.BPI) { auto BranchProbability = FuncInfo.BPI->getEdgeProbability( BI->getParent(), Target->getBasicBlock()); FuncInfo.MBB->addSuccessor(Target, BranchProbability); } else FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; } else { AArch64CC::CondCode CC = AArch64CC::NE; if (foldXALUIntrinsic(CC, I, BI->getCondition())) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. unsigned CondReg = getRegForValue(BI->getCondition()); if (!CondReg) return false; // Emit the branch. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) .addImm(CC) .addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); return true; } } unsigned CondReg = getRegForValue(BI->getCondition()); if (CondReg == 0) return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); // i1 conditions come as i32 values, test the lowest bit with tb(n)z. unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); Opcode = AArch64::TBZW; } const MCInstrDesc &II = TII.get(Opcode); unsigned ConstrainedCondReg = constrainOperandRegClass(II, CondReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) .addImm(0) .addMBB(TBB); finishCondBranch(BI->getParent(), TBB, FBB); return true; } bool AArch64FastISel::selectIndirectBr(const Instruction *I) { const IndirectBrInst *BI = cast<IndirectBrInst>(I); unsigned AddrReg = getRegForValue(BI->getOperand(0)); if (AddrReg == 0) return false; // Emit the indirect branch. const MCInstrDesc &II = TII.get(AArch64::BR); AddrReg = constrainOperandRegClass(II, AddrReg, II.getNumDefs()); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. for (auto *Succ : BI->successors()) FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); return true; } bool AArch64FastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); // Vectors of i1 are weird: bail out. if (CI->getType()->isVectorTy()) return false; // Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: ResultReg = createResultReg(&AArch64::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(AArch64::WZR, getKillRegState(true)); break; case CmpInst::FCMP_TRUE: ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1); break; } if (ResultReg) { updateValueMap(I, ResultReg); return true; } // Emit the cmp. if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) return false; ResultReg = createResultReg(&AArch64::GPR32RegClass); // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These // condition codes are inverted, because they are used by CSINC. static unsigned CondCodeTable[2][2] = { { AArch64CC::NE, AArch64CC::VC }, { AArch64CC::PL, AArch64CC::LE } }; unsigned *CondCodes = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_UEQ: CondCodes = &CondCodeTable[0][0]; break; case CmpInst::FCMP_ONE: CondCodes = &CondCodeTable[1][0]; break; } if (CondCodes) { unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), TmpReg1) .addReg(AArch64::WZR, getKillRegState(true)) .addReg(AArch64::WZR, getKillRegState(true)) .addImm(CondCodes[0]); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), ResultReg) .addReg(TmpReg1, getKillRegState(true)) .addReg(AArch64::WZR, getKillRegState(true)) .addImm(CondCodes[1]); updateValueMap(I, ResultReg); return true; } // Now set a register based on the comparison. AArch64CC::CondCode CC = getCompareCC(Predicate); assert((CC != AArch64CC::AL) && "Unexpected condition code."); AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), ResultReg) .addReg(AArch64::WZR, getKillRegState(true)) .addReg(AArch64::WZR, getKillRegState(true)) .addImm(invertedCC); updateValueMap(I, ResultReg); return true; } /// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false' /// value. 
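/// For example (illustration, not in the original source):
/// "select i1 %c, i1 true, i1 %f" reduces to "orr %c, %f" and
/// "select i1 %c, i1 false, i1 %f" reduces to "bic %f, %c" (%f & ~%c),
/// so no conditional select instruction is required.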
bool AArch64FastISel::optimizeSelect(const SelectInst *SI) { if (!SI->getType()->isIntegerTy(1)) return false; const Value *Src1Val, *Src2Val; unsigned Opc = 0; bool NeedExtraOp = false; if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) { if (CI->isOne()) { Src1Val = SI->getCondition(); Src2Val = SI->getFalseValue(); Opc = AArch64::ORRWrr; } else { assert(CI->isZero()); Src1Val = SI->getFalseValue(); Src2Val = SI->getCondition(); Opc = AArch64::BICWrr; } } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) { if (CI->isOne()) { Src1Val = SI->getCondition(); Src2Val = SI->getTrueValue(); Opc = AArch64::ORRWrr; NeedExtraOp = true; } else { assert(CI->isZero()); Src1Val = SI->getCondition(); Src2Val = SI->getTrueValue(); Opc = AArch64::ANDWrr; } } if (!Opc) return false; unsigned Src1Reg = getRegForValue(Src1Val); if (!Src1Reg) return false; bool Src1IsKill = hasTrivialKill(Src1Val); unsigned Src2Reg = getRegForValue(Src2Val); if (!Src2Reg) return false; bool Src2IsKill = hasTrivialKill(Src2Val); if (NeedExtraOp) { Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1); Src1IsKill = true; } unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg, Src1IsKill, Src2Reg, Src2IsKill); updateValueMap(SI, ResultReg); return true; } bool AArch64FastISel::selectSelect(const Instruction *I) { assert(isa<SelectInst>(I) && "Expected a select instruction."); MVT VT; if (!isTypeSupported(I->getType(), VT)) return false; unsigned Opc; const TargetRegisterClass *RC; switch (VT.SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: Opc = AArch64::CSELWr; RC = &AArch64::GPR32RegClass; break; case MVT::i64: Opc = AArch64::CSELXr; RC = &AArch64::GPR64RegClass; break; case MVT::f32: Opc = AArch64::FCSELSrrr; RC = &AArch64::FPR32RegClass; break; case MVT::f64: Opc = AArch64::FCSELDrrr; RC = &AArch64::FPR64RegClass; break; } const SelectInst *SI = cast<SelectInst>(I); const Value *Cond = SI->getCondition(); AArch64CC::CondCode CC = AArch64CC::NE; AArch64CC::CondCode ExtraCC = AArch64CC::AL; if (optimizeSelect(SI)) return true; // Try to pick up the flags, so we don't have to emit another compare. if (foldXALUIntrinsic(CC, I, Cond)) { // Fake request the condition to force emission of the XALU intrinsic. unsigned CondReg = getRegForValue(Cond); if (!CondReg) return false; } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() && isValueAvailable(Cond)) { const auto *Cmp = cast<CmpInst>(Cond); // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp); const Value *FoldSelect = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: FoldSelect = SI->getFalseValue(); break; case CmpInst::FCMP_TRUE: FoldSelect = SI->getTrueValue(); break; } if (FoldSelect) { unsigned SrcReg = getRegForValue(FoldSelect); if (!SrcReg) return false; unsigned UseReg = lookUpRegForValue(SI); if (UseReg) MRI.clearKillFlags(UseReg); updateValueMap(I, SrcReg); return true; } // Emit the cmp. if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned())) return false; // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
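// E.g. (illustration, not in the original source): for FCMP_UEQ the code
// below emits two chained conditional selects on the same flags, first on EQ
// and then on VS, so the true value is picked when the operands compared
// equal or were unordered.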
CC = getCompareCC(Predicate); switch (Predicate) { default: break; case CmpInst::FCMP_UEQ: ExtraCC = AArch64CC::EQ; CC = AArch64CC::VS; break; case CmpInst::FCMP_ONE: ExtraCC = AArch64CC::MI; CC = AArch64CC::GT; break; } assert((CC != AArch64CC::AL) && "Unexpected condition code."); } else { unsigned CondReg = getRegForValue(Cond); if (!CondReg) return false; bool CondIsKill = hasTrivialKill(Cond); const MCInstrDesc &II = TII.get(AArch64::ANDSWri); CondReg = constrainOperandRegClass(II, CondReg, 1); // Emit a TST instruction (ANDS wzr, reg, #imm). BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, AArch64::WZR) .addReg(CondReg, getKillRegState(CondIsKill)) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); } unsigned Src1Reg = getRegForValue(SI->getTrueValue()); bool Src1IsKill = hasTrivialKill(SI->getTrueValue()); unsigned Src2Reg = getRegForValue(SI->getFalseValue()); bool Src2IsKill = hasTrivialKill(SI->getFalseValue()); if (!Src1Reg || !Src2Reg) return false; if (ExtraCC != AArch64CC::AL) { Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, Src2IsKill, ExtraCC); Src2IsKill = true; } unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg, Src2IsKill, CC); updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::selectFPExt(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) return false; unsigned Op = getRegForValue(V); if (Op == 0) return false; unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), ResultReg).addReg(Op); updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::selectFPTrunc(const Instruction *I) { Value *V = I->getOperand(0); if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) return false; unsigned Op = getRegForValue(V); if (Op == 0) return false; unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), ResultReg).addReg(Op); updateValueMap(I, ResultReg); return true; } // FPToUI and FPToSI bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; unsigned SrcReg = getRegForValue(I->getOperand(0)); if (SrcReg == 0) return false; EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); if (SrcVT == MVT::f128 || SrcVT == MVT::f16) return false; unsigned Opc; if (SrcVT == MVT::f64) { if (Signed) Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; else Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; } else { if (Signed) Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; else Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; } unsigned ResultReg = createResultReg( DestVT == MVT::i32 ? 
&AArch64::GPR32RegClass : &AArch64::GPR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg); updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) { MVT DestVT; if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) return false; // Let regular ISEL handle FP16 if (DestVT == MVT::f16) return false; assert((DestVT == MVT::f32 || DestVT == MVT::f64) && "Unexpected value type."); unsigned SrcReg = getRegForValue(I->getOperand(0)); if (!SrcReg) return false; bool SrcIsKill = hasTrivialKill(I->getOperand(0)); EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true); // Handle sign-extension. if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { SrcReg = emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); if (!SrcReg) return false; SrcIsKill = true; } unsigned Opc; if (SrcVT == MVT::i64) { if (Signed) Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; else Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; } else { if (Signed) Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; else Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; } unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg, SrcIsKill); updateValueMap(I, ResultReg); return true; } bool AArch64FastISel::fastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; const Function *F = FuncInfo.Fn; if (F->isVarArg()) return false; CallingConv::ID CC = F->getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Swift) return false; // Only handle simple cases of up to 8 GPR and FPR each. unsigned GPRCnt = 0; unsigned FPRCnt = 0; for (auto const &Arg : F->args()) { if (Arg.hasAttribute(Attribute::ByVal) || Arg.hasAttribute(Attribute::InReg) || Arg.hasAttribute(Attribute::StructRet) || Arg.hasAttribute(Attribute::SwiftSelf) || Arg.hasAttribute(Attribute::SwiftError) || Arg.hasAttribute(Attribute::Nest)) return false; Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy()) return false; EVT ArgVT = TLI.getValueType(DL, ArgTy); if (!ArgVT.isSimple()) return false; MVT VT = ArgVT.getSimpleVT().SimpleTy; if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8()) return false; if (VT.isVector() && (!Subtarget->hasNEON() || !Subtarget->isLittleEndian())) return false; if (VT >= MVT::i1 && VT <= MVT::i64) ++GPRCnt; else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() || VT.is128BitVector()) ++FPRCnt; else return false; if (GPRCnt > 8 || FPRCnt > 8) return false; } static const MCPhysReg Registers[6][8] = { { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, AArch64::W5, AArch64::W6, AArch64::W7 }, { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }, { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, AArch64::H5, AArch64::H6, AArch64::H7 }, { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, AArch64::S5, AArch64::S6, AArch64::S7 }, { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4, AArch64::D5, AArch64::D6, AArch64::D7 }, { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7 } }; unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { MVT VT = TLI.getSimpleValueType(DL, Arg.getType()); unsigned SrcReg; const TargetRegisterClass *RC; if (VT >= MVT::i1 && VT <= 
MVT::i32) { SrcReg = Registers[0][GPRIdx++]; RC = &AArch64::GPR32RegClass; VT = MVT::i32; } else if (VT == MVT::i64) { SrcReg = Registers[1][GPRIdx++]; RC = &AArch64::GPR64RegClass; } else if (VT == MVT::f16) { SrcReg = Registers[2][FPRIdx++]; RC = &AArch64::FPR16RegClass; } else if (VT == MVT::f32) { SrcReg = Registers[3][FPRIdx++]; RC = &AArch64::FPR32RegClass; } else if ((VT == MVT::f64) || VT.is64BitVector()) { SrcReg = Registers[4][FPRIdx++]; RC = &AArch64::FPR64RegClass; } else if (VT.is128BitVector()) { SrcReg = Registers[5][FPRIdx++]; RC = &AArch64::FPR128RegClass; } else llvm_unreachable("Unexpected value type."); unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); updateValueMap(&Arg, ResultReg); } return true; } bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, unsigned &NumBytes) { CallingConv::ID CC = CLI.CallConv; SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context); CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC)); // Get a count of how many bytes are to be pushed on the stack. NumBytes = CCInfo.getNextStackOffset(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) .addImm(NumBytes).addImm(0); // Process the args. for (CCValAssign &VA : ArgLocs) { const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; unsigned ArgReg = getRegForValue(ArgVal); if (!ArgReg) return false; // Handle arg promotion: SExt, ZExt, AExt. switch (VA.getLocInfo()) { case CCValAssign::Full: break; case CCValAssign::SExt: { MVT DestVT = VA.getLocVT(); MVT SrcVT = ArgVT; ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false); if (!ArgReg) return false; break; } case CCValAssign::AExt: // Intentional fall-through. case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); MVT SrcVT = ArgVT; ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true); if (!ArgReg) return false; break; } default: llvm_unreachable("Unknown arg promotion!"); } // Now copy/store arg to correct locations. if (VA.isRegLoc() && !VA.needsCustom()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); CLI.OutRegs.push_back(VA.getLocReg()); } else if (VA.needsCustom()) { // FIXME: Handle custom args. return false; } else { assert(VA.isMemLoc() && "Assuming store on stack."); // Don't emit stores for undef values. if (isa<UndefValue>(ArgVal)) continue; // Need to store on the stack.
unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8; unsigned BEAlign = 0; if (ArgSize < 8 && !Subtarget->isLittleEndian()) BEAlign = 8 - ArgSize; Address Addr; Addr.setKind(Address::RegBase); Addr.setReg(AArch64::SP); Addr.setOffset(VA.getLocMemOffset() + BEAlign); unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; } } return true; } bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes) { CallingConv::ID CC = CLI.CallConv; // Issue CALLSEQ_END unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(0); // Now the return value. if (RetVT != MVT::isVoid) { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); // Only handle a single return value. if (RVLocs.size() != 1) return false; // Copy all of the result registers out of their specified physreg. MVT CopyVT = RVLocs[0].getValVT(); // TODO: Handle big-endian results if (CopyVT.isVector() && !Subtarget->isLittleEndian()) return false; unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(RVLocs[0].getLocReg()); CLI.InRegs.push_back(RVLocs[0].getLocReg()); CLI.ResultReg = ResultReg; CLI.NumResultRegs = 1; } return true; } bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { CallingConv::ID CC = CLI.CallConv; bool IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; MCSymbol *Symbol = CLI.Symbol; if (!Callee && !Symbol) return false; // Allow SelectionDAG isel to handle tail calls. if (IsTailCall) return false; CodeModel::Model CM = TM.getCodeModel(); // Only support the small-addressing and large code models. if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) return false; // FIXME: Add large code model support for ELF. if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) return false; // Let SDISel handle vararg functions. if (IsVarArg) return false; // FIXME: Only handle *simple* calls for now. MVT RetVT; if (CLI.RetTy->isVoidTy()) RetVT = MVT::isVoid; else if (!isTypeLegal(CLI.RetTy, RetVT)) return false; for (auto Flag : CLI.OutFlags) if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() || Flag.isSwiftSelf() || Flag.isSwiftError()) return false; // Set up the argument vectors. SmallVector<MVT, 16> OutVTs; OutVTs.reserve(CLI.OutVals.size()); for (auto *Val : CLI.OutVals) { MVT VT; if (!isTypeLegal(Val->getType(), VT) && !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) return false; // We don't handle vector parameters yet. if (VT.isVector() || VT.getSizeInBits() > 64) return false; OutVTs.push_back(VT); } Address Addr; if (Callee && !computeCallAddress(Callee, Addr)) return false; // Handle the arguments now that we've gotten them. unsigned NumBytes; if (!processCallArgs(CLI, OutVTs, NumBytes)) return false; // Issue the call. MachineInstrBuilder MIB; if (Subtarget->useSmallAddressing()) { const MCInstrDesc &II = TII.get(Addr.getReg() ?
bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
  CallingConv::ID CC  = CLI.CallConv;
  bool IsTailCall     = CLI.IsTailCall;
  bool IsVarArg       = CLI.IsVarArg;
  const Value *Callee = CLI.Callee;
  MCSymbol *Symbol    = CLI.Symbol;

  if (!Callee && !Symbol)
    return false;

  // Allow SelectionDAG isel to handle tail calls.
  if (IsTailCall)
    return false;

  CodeModel::Model CM = TM.getCodeModel();
  // Only support the small-addressing and large code models.
  if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
    return false;

  // FIXME: Add large code model support for ELF.
  if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
    return false;

  // Let SDISel handle vararg functions.
  if (IsVarArg)
    return false;

  // FIXME: Only handle *simple* calls for now.
  MVT RetVT;
  if (CLI.RetTy->isVoidTy())
    RetVT = MVT::isVoid;
  else if (!isTypeLegal(CLI.RetTy, RetVT))
    return false;

  for (auto Flag : CLI.OutFlags)
    if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() ||
        Flag.isSwiftSelf() || Flag.isSwiftError())
      return false;

  // Set up the argument vectors.
  SmallVector<MVT, 16> OutVTs;
  OutVTs.reserve(CLI.OutVals.size());

  for (auto *Val : CLI.OutVals) {
    MVT VT;
    if (!isTypeLegal(Val->getType(), VT) &&
        !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
      return false;

    // We don't handle vector parameters yet.
    if (VT.isVector() || VT.getSizeInBits() > 64)
      return false;

    OutVTs.push_back(VT);
  }

  Address Addr;
  if (Callee && !computeCallAddress(Callee, Addr))
    return false;

  // Handle the arguments now that we've gotten them.
  unsigned NumBytes;
  if (!processCallArgs(CLI, OutVTs, NumBytes))
    return false;

  // Issue the call.
  MachineInstrBuilder MIB;
  if (Subtarget->useSmallAddressing()) {
    const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
    if (Symbol)
      MIB.addSym(Symbol, 0);
    else if (Addr.getGlobalValue())
      MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
    else if (Addr.getReg()) {
      unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
      MIB.addReg(Reg);
    } else
      return false;
  } else {
    unsigned CallReg = 0;
    if (Symbol) {
      unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
              ADRPReg)
          .addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE);

      CallReg = createResultReg(&AArch64::GPR64RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(AArch64::LDRXui), CallReg)
          .addReg(ADRPReg)
          .addSym(Symbol,
                  AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
    } else if (Addr.getGlobalValue())
      CallReg = materializeGV(Addr.getGlobalValue());
    else if (Addr.getReg())
      CallReg = Addr.getReg();

    if (!CallReg)
      return false;

    const MCInstrDesc &II = TII.get(AArch64::BLR);
    CallReg = constrainOperandRegClass(II, CallReg, 0);
    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
  }

  // Add implicit physical register uses to the call.
  for (auto Reg : CLI.OutRegs)
    MIB.addReg(Reg, RegState::Implicit);

  // Add a register mask with the call-preserved registers.
  // Proper defs for return values will be added by setPhysRegsDeadExcept().
  MIB.addRegMask(TRI.getCallPreservedMask(*FuncInfo.MF, CC));

  CLI.Call = MIB;

  // Finish off the call including any return values.
  return finishCall(CLI, RetVT, NumBytes);
}

bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
  if (Alignment)
    return Len / Alignment <= 4;
  else
    return Len < 32;
}

bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
                                         uint64_t Len, unsigned Alignment) {
  // Make sure we don't bloat code by inlining very large memcpy's.
  if (!isMemCpySmall(Len, Alignment))
    return false;

  int64_t UnscaledOffset = 0;
  Address OrigDest = Dest;
  Address OrigSrc = Src;

  while (Len) {
    MVT VT;
    if (!Alignment || Alignment >= 8) {
      if (Len >= 8)
        VT = MVT::i64;
      else if (Len >= 4)
        VT = MVT::i32;
      else if (Len >= 2)
        VT = MVT::i16;
      else {
        VT = MVT::i8;
      }
    } else {
      // Bound based on alignment.
      if (Len >= 4 && Alignment == 4)
        VT = MVT::i32;
      else if (Len >= 2 && Alignment == 2)
        VT = MVT::i16;
      else {
        VT = MVT::i8;
      }
    }

    unsigned ResultReg = emitLoad(VT, VT, Src);
    if (!ResultReg)
      return false;

    if (!emitStore(VT, ResultReg, Dest))
      return false;

    int64_t Size = VT.getSizeInBits() / 8;
    Len -= Size;
    UnscaledOffset += Size;

    // We need to recompute the unscaled offset for each iteration.
    Dest.setOffset(OrigDest.getOffset() + UnscaledOffset);
    Src.setOffset(OrigSrc.getOffset() + UnscaledOffset);
  }

  return true;
}
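// For example, tryEmitSmallMemCpy splits a 15-byte copy with 8-byte (or
// unknown) alignment into four load/store pairs: i64 at +0, i32 at +8,
// i16 at +12, and i8 at +14.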
/// \brief Check if it is possible to fold the condition from the XALU intrinsic
/// into the user. The condition code will only be updated on success.
bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
                                        const Instruction *I,
                                        const Value *Cond) {
  if (!isa<ExtractValueInst>(Cond))
    return false;

  const auto *EV = cast<ExtractValueInst>(Cond);
  if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
    return false;

  const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
  MVT RetVT;
  const Function *Callee = II->getCalledFunction();
  Type *RetTy =
      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
  if (!isTypeLegal(RetTy, RetVT))
    return false;

  if (RetVT != MVT::i32 && RetVT != MVT::i64)
    return false;

  const Value *LHS = II->getArgOperand(0);
  const Value *RHS = II->getArgOperand(1);

  // Canonicalize immediate to the RHS.
  if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
      isCommutativeIntrinsic(II))
    std::swap(LHS, RHS);

  // Simplify multiplies.
  Intrinsic::ID IID = II->getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::smul_with_overflow:
    if (const auto *C = dyn_cast<ConstantInt>(RHS))
      if (C->getValue() == 2)
        IID = Intrinsic::sadd_with_overflow;
    break;
  case Intrinsic::umul_with_overflow:
    if (const auto *C = dyn_cast<ConstantInt>(RHS))
      if (C->getValue() == 2)
        IID = Intrinsic::uadd_with_overflow;
    break;
  }

  AArch64CC::CondCode TmpCC;
  switch (IID) {
  default:
    return false;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
    TmpCC = AArch64CC::VS;
    break;
  case Intrinsic::uadd_with_overflow:
    TmpCC = AArch64CC::HS;
    break;
  case Intrinsic::usub_with_overflow:
    TmpCC = AArch64CC::LO;
    break;
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    TmpCC = AArch64CC::NE;
    break;
  }

  // Check if both instructions are in the same basic block.
  if (!isValueAvailable(II))
    return false;

  // Make sure nothing is in the way.
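  // The canonical pattern accepted here looks like:
  //   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  //   %val = extractvalue { i32, i1 } %res, 0
  //   %ovf = extractvalue { i32, i1 } %res, 1
  // Anything other than extractvalue between the intrinsic and its user
  // defeats the folding.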
"memcpy" : "memmove"; return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2); } case Intrinsic::memset: { const MemSetInst *MSI = cast(II); // Don't handle volatile. if (MSI->isVolatile()) return false; if (!MSI->getLength()->getType()->isIntegerTy(64)) return false; if (MSI->getDestAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. return false; return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); } case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::pow: { MVT RetVT; if (!isTypeLegal(II->getType(), RetVT)) return false; if (RetVT != MVT::f32 && RetVT != MVT::f64) return false; static const RTLIB::Libcall LibCallTable[3][2] = { { RTLIB::SIN_F32, RTLIB::SIN_F64 }, { RTLIB::COS_F32, RTLIB::COS_F64 }, { RTLIB::POW_F32, RTLIB::POW_F64 } }; RTLIB::Libcall LC; bool Is64Bit = RetVT == MVT::f64; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); case Intrinsic::sin: LC = LibCallTable[0][Is64Bit]; break; case Intrinsic::cos: LC = LibCallTable[1][Is64Bit]; break; case Intrinsic::pow: LC = LibCallTable[2][Is64Bit]; break; } ArgListTy Args; Args.reserve(II->getNumArgOperands()); // Populate the argument list. for (auto &Arg : II->arg_operands()) { ArgListEntry Entry; Entry.Val = Arg; Entry.Ty = Arg->getType(); Args.push_back(Entry); } CallLoweringInfo CLI; MCContext &Ctx = MF->getContext(); CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), II->getType(), TLI.getLibcallName(LC), std::move(Args)); if (!lowerCallTo(CLI)) return false; updateValueMap(II, CLI.ResultReg); return true; } case Intrinsic::fabs: { MVT VT; if (!isTypeLegal(II->getType(), VT)) return false; unsigned Opc; switch (VT.SimpleTy) { default: return false; case MVT::f32: Opc = AArch64::FABSSr; break; case MVT::f64: Opc = AArch64::FABSDr; break; } unsigned SrcReg = getRegForValue(II->getOperand(0)); if (!SrcReg) return false; bool SrcRegIsKill = hasTrivialKill(II->getOperand(0)); unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(SrcReg, getKillRegState(SrcRegIsKill)); updateValueMap(II, ResultReg); return true; } case Intrinsic::trap: BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK)) .addImm(1); return true; case Intrinsic::sqrt: { Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; unsigned Op0Reg = getRegForValue(II->getOperand(0)); if (!Op0Reg) return false; bool Op0IsKill = hasTrivialKill(II->getOperand(0)); unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill); if (!ResultReg) return false; updateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::usub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { // This implements the basic lowering of the xalu with overflow intrinsics. const Function *Callee = II->getCalledFunction(); auto *Ty = cast(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; if (VT != MVT::i32 && VT != MVT::i64) return false; const Value *LHS = II->getArgOperand(0); const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. if (isa(LHS) && !isa(RHS) && isCommutativeIntrinsic(II)) std::swap(LHS, RHS); // Simplify multiplies. 
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    // This implements the basic lowering of the xalu with overflow intrinsics.
    const Function *Callee = II->getCalledFunction();
    auto *Ty = cast<StructType>(Callee->getReturnType());
    Type *RetTy = Ty->getTypeAtIndex(0U);

    MVT VT;
    if (!isTypeLegal(RetTy, VT))
      return false;

    if (VT != MVT::i32 && VT != MVT::i64)
      return false;

    const Value *LHS = II->getArgOperand(0);
    const Value *RHS = II->getArgOperand(1);
    // Canonicalize immediate to the RHS.
    if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
        isCommutativeIntrinsic(II))
      std::swap(LHS, RHS);

    // Simplify multiplies.
    Intrinsic::ID IID = II->getIntrinsicID();
    switch (IID) {
    default:
      break;
    case Intrinsic::smul_with_overflow:
      if (const auto *C = dyn_cast<ConstantInt>(RHS))
        if (C->getValue() == 2) {
          IID = Intrinsic::sadd_with_overflow;
          RHS = LHS;
        }
      break;
    case Intrinsic::umul_with_overflow:
      if (const auto *C = dyn_cast<ConstantInt>(RHS))
        if (C->getValue() == 2) {
          IID = Intrinsic::uadd_with_overflow;
          RHS = LHS;
        }
      break;
    }

    unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
    AArch64CC::CondCode CC = AArch64CC::Invalid;
    switch (IID) {
    default: llvm_unreachable("Unexpected intrinsic!");
    case Intrinsic::sadd_with_overflow:
      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
      CC = AArch64CC::VS;
      break;
    case Intrinsic::uadd_with_overflow:
      ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
      CC = AArch64CC::HS;
      break;
    case Intrinsic::ssub_with_overflow:
      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
      CC = AArch64CC::VS;
      break;
    case Intrinsic::usub_with_overflow:
      ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
      CC = AArch64CC::LO;
      break;
    case Intrinsic::smul_with_overflow: {
      CC = AArch64CC::NE;
      unsigned LHSReg = getRegForValue(LHS);
      if (!LHSReg)
        return false;
      bool LHSIsKill = hasTrivialKill(LHS);

      unsigned RHSReg = getRegForValue(RHS);
      if (!RHSReg)
        return false;
      bool RHSIsKill = hasTrivialKill(RHS);

      if (VT == MVT::i32) {
        MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
        unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
                                       /*IsKill=*/false, 32);
        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
                                            AArch64::sub_32);
        ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
                                              AArch64::sub_32);
        emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
                    AArch64_AM::ASR, 31, /*WantResult=*/false);
      } else {
        assert(VT == MVT::i64 && "Unexpected value type.");
        // LHSReg and RHSReg cannot be killed by this Mul, since they are
        // reused in the next instruction.
        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
                            /*IsKill=*/false);
        unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
                                        RHSReg, RHSIsKill);
        emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
                    AArch64_AM::ASR, 63, /*WantResult=*/false);
      }
      break;
    }
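    // The unsigned 32-bit check below is simpler: the product cannot have
    // overflowed iff its high half is zero:
    //   umull x8, w0, w1
    //   cmp   xzr, x8, lsr #32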
    case Intrinsic::umul_with_overflow: {
      CC = AArch64CC::NE;
      unsigned LHSReg = getRegForValue(LHS);
      if (!LHSReg)
        return false;
      bool LHSIsKill = hasTrivialKill(LHS);

      unsigned RHSReg = getRegForValue(RHS);
      if (!RHSReg)
        return false;
      bool RHSIsKill = hasTrivialKill(RHS);

      if (VT == MVT::i32) {
        MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
        emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
                    /*IsKill=*/false, AArch64_AM::LSR, 32,
                    /*WantResult=*/false);
        MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
                                            AArch64::sub_32);
      } else {
        assert(VT == MVT::i64 && "Unexpected value type.");
        // LHSReg and RHSReg cannot be killed by this Mul, since they are
        // reused in the next instruction.
        MulReg = emitMul_rr(VT, LHSReg, /*IsKill=*/false, RHSReg,
                            /*IsKill=*/false);
        unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
                                        RHSReg, RHSIsKill);
        emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
                    /*IsKill=*/false, /*WantResult=*/false);
      }
      break;
    }
    }

    if (MulReg) {
      ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
    }

    ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
                                  AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
                                  /*IsKill=*/true, getInvertedCondCode(CC));
    (void)ResultReg2;
    assert((ResultReg1 + 1) == ResultReg2 &&
           "Nonconsecutive result registers.");
    updateValueMap(II, ResultReg1, 2);
    return true;
  }
  }
  return false;
}

bool AArch64FastISel::selectRet(const Instruction *I) {
  const ReturnInst *Ret = cast<ReturnInst>(I);
  const Function &F = *I->getParent()->getParent();

  if (!FuncInfo.CanLowerReturn)
    return false;

  if (F.isVarArg())
    return false;

  if (TLI.supportSwiftError() &&
      F.getAttributes().hasAttrSomewhere(Attribute::SwiftError))
    return false;

  if (TLI.supportSplitCSR(FuncInfo.MF))
    return false;

  // Build a list of return value registers.
  SmallVector<unsigned, 4> RetRegs;

  if (Ret->getNumOperands() > 0) {
    CallingConv::ID CC = F.getCallingConv();
    SmallVector<ISD::OutputArg, 4> Outs;
    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI, DL);

    // Analyze operands of the call, assigning locations to each operand.
    SmallVector<CCValAssign, 16> ValLocs;
    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
                                                     : RetCC_AArch64_AAPCS;
    CCInfo.AnalyzeReturn(Outs, RetCC);

    // Only handle a single return value for now.
    if (ValLocs.size() != 1)
      return false;

    CCValAssign &VA = ValLocs[0];
    const Value *RV = Ret->getOperand(0);

    // Don't bother handling odd stuff for now.
    if ((VA.getLocInfo() != CCValAssign::Full) &&
        (VA.getLocInfo() != CCValAssign::BCvt))
      return false;

    // Only handle register returns for now.
    if (!VA.isRegLoc())
      return false;

    unsigned Reg = getRegForValue(RV);
    if (Reg == 0)
      return false;

    unsigned SrcReg = Reg + VA.getValNo();
    unsigned DestReg = VA.getLocReg();
    // Avoid a cross-class copy. This is very unlikely.
    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
      return false;

    EVT RVEVT = TLI.getValueType(DL, RV->getType());
    if (!RVEVT.isSimple())
      return false;

    // Vectors (of > 1 lane) in big endian need tricky handling.
    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
        !Subtarget->isLittleEndian())
      return false;

    MVT RVVT = RVEVT.getSimpleVT();
    if (RVVT == MVT::f128)
      return false;

    MVT DestVT = VA.getValVT();
    // Special handling for extended integers.
    if (RVVT != DestVT) {
      if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16)
        return false;

      if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
        return false;

      bool IsZExt = Outs[0].Flags.isZExt();
      SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
      if (SrcReg == 0)
        return false;
    }

    // Make the copy.
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);

    // Add register to return instruction.
    RetRegs.push_back(VA.getLocReg());
  }

  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                    TII.get(AArch64::RET_ReallyLR));
  for (unsigned RetReg : RetRegs)
    MIB.addReg(RetReg, RegState::Implicit);
  return true;
}
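// For example, trunc i64 %x to i8 becomes an EXTRACT_SUBREG of the low 32
// bits followed by "and w8, w8, #0xff"; for i32 and narrower sources a plain
// COPY suffices because the upper bits are already undefined.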
bool AArch64FastISel::selectTrunc(const Instruction *I) {
  Type *DestTy = I->getType();
  Value *Op = I->getOperand(0);
  Type *SrcTy = Op->getType();

  EVT SrcEVT = TLI.getValueType(DL, SrcTy, true);
  EVT DestEVT = TLI.getValueType(DL, DestTy, true);
  if (!SrcEVT.isSimple())
    return false;
  if (!DestEVT.isSimple())
    return false;

  MVT SrcVT = SrcEVT.getSimpleVT();
  MVT DestVT = DestEVT.getSimpleVT();

  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
      SrcVT != MVT::i8)
    return false;
  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 &&
      DestVT != MVT::i1)
    return false;

  unsigned SrcReg = getRegForValue(Op);
  if (!SrcReg)
    return false;
  bool SrcIsKill = hasTrivialKill(Op);

  // If we're truncating from i64 to a smaller non-legal type then generate an
  // AND. Otherwise, we know the high bits are undefined and a truncate only
  // generates a COPY. We cannot mark the source register also as result
  // register, because this can incorrectly transfer the kill flag onto the
  // source register.
  unsigned ResultReg;
  if (SrcVT == MVT::i64) {
    uint64_t Mask = 0;
    switch (DestVT.SimpleTy) {
    default:
      // Trunc i64 to i32 is handled by the target-independent fast-isel.
      return false;
    case MVT::i1:
      Mask = 0x1;
      break;
    case MVT::i8:
      Mask = 0xff;
      break;
    case MVT::i16:
      Mask = 0xffff;
      break;
    }
    // Issue an extract_subreg to get the lower 32-bits.
    unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
                                                AArch64::sub_32);
    // Create the AND instruction which performs the actual truncation.
    ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
    assert(ResultReg && "Unexpected AND instruction emission failure.");
  } else {
    ResultReg = createResultReg(&AArch64::GPR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::COPY), ResultReg)
        .addReg(SrcReg, getKillRegState(SrcIsKill));
  }

  updateValueMap(I, ResultReg);
  return true;
}

unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
  assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
          DestVT == MVT::i64) &&
         "Unexpected value type.");
  // Handle i8 and i16 as i32.
  if (DestVT == MVT::i8 || DestVT == MVT::i16)
    DestVT = MVT::i32;

  if (IsZExt) {
    unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
    assert(ResultReg && "Unexpected AND instruction emission failure.");
    if (DestVT == MVT::i64) {
      // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the
      // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
      unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(AArch64::SUBREG_TO_REG), Reg64)
          .addImm(0)
          .addReg(ResultReg)
          .addImm(AArch64::sub_32);
      ResultReg = Reg64;
    }
    return ResultReg;
  } else {
    if (DestVT == MVT::i64) {
      // FIXME: We're SExt i1 to i64.
      return 0;
    }
    return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
                            /*TODO:IsKill=*/false, 0, 0);
  }
}

unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
                                     unsigned Op1, bool Op1IsKill) {
  unsigned Opc, ZReg;
  switch (RetVT.SimpleTy) {
  default: return 0;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    RetVT = MVT::i32;
    Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break;
  case MVT::i64:
    Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break;
  }

  const TargetRegisterClass *RC =
      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
                          ZReg, /*IsKill=*/true);
}

unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
                                       unsigned Op1, bool Op1IsKill) {
  if (RetVT != MVT::i64)
    return 0;

  return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
                          Op0, Op0IsKill, Op1, Op1IsKill,
                          AArch64::XZR, /*IsKill=*/true);
}

unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
                                       unsigned Op1, bool Op1IsKill) {
  if (RetVT != MVT::i64)
    return 0;

  return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
                          Op0, Op0IsKill, Op1, Op1IsKill,
                          AArch64::XZR, /*IsKill=*/true);
}

unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
                                     unsigned Op1Reg, bool Op1IsKill) {
  unsigned Opc = 0;
  bool NeedTrunc = false;
  uint64_t Mask = 0;
  switch (RetVT.SimpleTy) {
  default: return 0;
  case MVT::i8:  Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff;   break;
  case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break;
  case MVT::i32: Opc = AArch64::LSLVWr;                                  break;
  case MVT::i64: Opc = AArch64::LSLVXr;                                  break;
  }

  const TargetRegisterClass *RC =
      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  if (NeedTrunc) {
    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
    Op1IsKill = true;
  }
  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
                                       Op1IsKill);
  if (NeedTrunc)
    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
  return ResultReg;
}

unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
                                     bool Op0IsKill, uint64_t Shift,
                                     bool IsZExt) {
  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
         "Unexpected source/return type pair.");
  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
         "Unexpected source value type.");
  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
          RetVT == MVT::i64) && "Unexpected return value type.");

  bool Is64Bit = (RetVT == MVT::i64);
  unsigned RegSize = Is64Bit ? 64 : 32;
  unsigned DstBits = RetVT.getSizeInBits();
  unsigned SrcBits = SrcVT.getSizeInBits();
  const TargetRegisterClass *RC =
      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;

  // Just emit a copy for "zero" shifts.
  if (Shift == 0) {
    if (RetVT == SrcVT) {
      unsigned ResultReg = createResultReg(RC);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), ResultReg)
          .addReg(Op0, getKillRegState(Op0IsKill));
      return ResultReg;
    } else
      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
  }

  // Don't deal with undefined shifts.
  if (Shift >= DstBits)
    return 0;

  // For immediate shifts we can fold the zero-/sign-extension into the shift.
  // {S|U}BFM Wd, Wn, #r, #s
  // Wd<32+s-r,32-r> = Wn<s:0> when r > s
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = shl i16 %1, 4
  // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
  // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
  // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
  // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = shl i16 %1, 8
  // Wd<32+7-24,32-24> = Wn<7:0>
  // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
  // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
  // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = shl i16 %1, 12
  // Wd<32+3-20,32-20> = Wn<3:0>
  // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
  // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
  // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext

  unsigned ImmR = RegSize - Shift;
  // Limit the width to the length of the source type.
  unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
  static const unsigned OpcTable[2][2] = {
    {AArch64::SBFMWri, AArch64::SBFMXri},
    {AArch64::UBFMWri, AArch64::UBFMXri}
  };
  unsigned Opc = OpcTable[IsZExt][Is64Bit];
  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
    unsigned TmpReg = MRI.createVirtualRegister(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
        .addImm(0)
        .addReg(Op0, getKillRegState(Op0IsKill))
        .addImm(AArch64::sub_32);
    Op0 = TmpReg;
    Op0IsKill = true;
  }
  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
}

unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
                                     unsigned Op1Reg, bool Op1IsKill) {
  unsigned Opc = 0;
  bool NeedTrunc = false;
  uint64_t Mask = 0;
  switch (RetVT.SimpleTy) {
  default: return 0;
  case MVT::i8:  Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff;   break;
  case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break;
  case MVT::i32: Opc = AArch64::LSRVWr;                                  break;
  case MVT::i64: Opc = AArch64::LSRVXr;                                  break;
  }

  const TargetRegisterClass *RC =
      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  if (NeedTrunc) {
    Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
    Op0IsKill = Op1IsKill = true;
  }
  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
                                       Op1IsKill);
  if (NeedTrunc)
    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
  return ResultReg;
}

unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
                                     bool Op0IsKill, uint64_t Shift,
                                     bool IsZExt) {
  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
         "Unexpected source/return type pair.");
  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
         "Unexpected source value type.");
  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
          RetVT == MVT::i64) && "Unexpected return value type.");

  bool Is64Bit = (RetVT == MVT::i64);
  unsigned RegSize = Is64Bit ? 64 : 32;
  unsigned DstBits = RetVT.getSizeInBits();
  unsigned SrcBits = SrcVT.getSizeInBits();
  const TargetRegisterClass *RC =
      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;

  // Just emit a copy for "zero" shifts.
  if (Shift == 0) {
    if (RetVT == SrcVT) {
      unsigned ResultReg = createResultReg(RC);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), ResultReg)
          .addReg(Op0, getKillRegState(Op0IsKill));
      return ResultReg;
    } else
      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
  }

  // Don't deal with undefined shifts.
  if (Shift >= DstBits)
    return 0;

  // For immediate shifts we can fold the zero-/sign-extension into the shift.
  // {S|U}BFM Wd, Wn, #r, #s
  // Wd<s-r:0> = Wn<s:r> when r <= s
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = lshr i16 %1, 4
  // Wd<7-4:0> = Wn<7:4>
  // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = lshr i16 %1, 8
  // Wd<7-7,0> = Wn<7:7>
  // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = lshr i16 %1, 12
  // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
  // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
  if (Shift >= SrcBits && IsZExt)
    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);

  // It is not possible to fold a sign-extend into the LShr instruction. In this
  // case emit a sign-extend.
  if (!IsZExt) {
    Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
    if (!Op0)
      return 0;
    Op0IsKill = true;
    SrcVT = RetVT;
    SrcBits = SrcVT.getSizeInBits();
    IsZExt = true;
  }

  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
  unsigned ImmS = SrcBits - 1;
  static const unsigned OpcTable[2][2] = {
    {AArch64::SBFMWri, AArch64::SBFMXri},
    {AArch64::UBFMWri, AArch64::UBFMXri}
  };
  unsigned Opc = OpcTable[IsZExt][Is64Bit];
  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
    unsigned TmpReg = MRI.createVirtualRegister(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
        .addImm(0)
        .addReg(Op0, getKillRegState(Op0IsKill))
        .addImm(AArch64::sub_32);
    Op0 = TmpReg;
    Op0IsKill = true;
  }
  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
}

unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
                                     unsigned Op1Reg, bool Op1IsKill) {
  unsigned Opc = 0;
  bool NeedTrunc = false;
  uint64_t Mask = 0;
  switch (RetVT.SimpleTy) {
  default: return 0;
  case MVT::i8:  Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff;   break;
  case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break;
  case MVT::i32: Opc = AArch64::ASRVWr;                                  break;
  case MVT::i64: Opc = AArch64::ASRVXr;                                  break;
  }
  const TargetRegisterClass *RC =
      (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  if (NeedTrunc) {
    Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
    Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
    Op0IsKill = Op1IsKill = true;
  }
  unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
                                       Op1IsKill);
  if (NeedTrunc)
    ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
  return ResultReg;
}

unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
                                     bool Op0IsKill, uint64_t Shift,
                                     bool IsZExt) {
  assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
         "Unexpected source/return type pair.");
  assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
          SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
         "Unexpected source value type.");
  assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
          RetVT == MVT::i64) && "Unexpected return value type.");

  bool Is64Bit = (RetVT == MVT::i64);
  unsigned RegSize = Is64Bit ? 64 : 32;
  unsigned DstBits = RetVT.getSizeInBits();
  unsigned SrcBits = SrcVT.getSizeInBits();
  const TargetRegisterClass *RC =
      Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;

  // Just emit a copy for "zero" shifts.
  if (Shift == 0) {
    if (RetVT == SrcVT) {
      unsigned ResultReg = createResultReg(RC);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(TargetOpcode::COPY), ResultReg)
          .addReg(Op0, getKillRegState(Op0IsKill));
      return ResultReg;
    } else
      return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
  }

  // Don't deal with undefined shifts.
  if (Shift >= DstBits)
    return 0;

  // For immediate shifts we can fold the zero-/sign-extension into the shift.
  // {S|U}BFM Wd, Wn, #r, #s
  // Wd<s-r:0> = Wn<s:r> when r <= s
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = ashr i16 %1, 4
  // Wd<7-4:0> = Wn<7:4>
  // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
  // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = ashr i16 %1, 8
  // Wd<7-7,0> = Wn<7:7>
  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
  // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
  // %2 = ashr i16 %1, 12
  // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
  // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
  // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
  if (Shift >= SrcBits && IsZExt)
    return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);

  unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
  unsigned ImmS = SrcBits - 1;
  static const unsigned OpcTable[2][2] = {
    {AArch64::SBFMWri, AArch64::SBFMXri},
    {AArch64::UBFMWri, AArch64::UBFMXri}
  };
  unsigned Opc = OpcTable[IsZExt][Is64Bit];
  if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
    unsigned TmpReg = MRI.createVirtualRegister(RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(AArch64::SUBREG_TO_REG), TmpReg)
        .addImm(0)
        .addReg(Op0, getKillRegState(Op0IsKill))
        .addImm(AArch64::sub_32);
    Op0 = TmpReg;
    Op0IsKill = true;
  }
  return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
}
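// For example, sext i16 %x to i64 is emitted below as a SUBREG_TO_REG that
// places the 32-bit source into a 64-bit register followed by
// "sbfm x8, x8, #0, #15", i.e. "sxth x8, w8".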
unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
                                     bool IsZExt) {
  assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");

  // FastISel does not have plumbing to deal with extensions where the SrcVT or
  // DestVT are odd things, so test to make sure that they are both types we can
  // handle (i1/i8/i16/i32 for SrcVT and i8/i16/i32/i64 for DestVT), otherwise
  // bail out to SelectionDAG.
  if (((DestVT != MVT::i8) && (DestVT != MVT::i16) &&
       (DestVT != MVT::i32) && (DestVT != MVT::i64)) ||
      ((SrcVT != MVT::i1) && (SrcVT != MVT::i8) &&
       (SrcVT != MVT::i16) && (SrcVT != MVT::i32)))
    return 0;

  unsigned Opc;
  unsigned Imm = 0;

  switch (SrcVT.SimpleTy) {
  default:
    return 0;
  case MVT::i1:
    return emiti1Ext(SrcReg, DestVT, IsZExt);
  case MVT::i8:
    if (DestVT == MVT::i64)
      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
    else
      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
    Imm = 7;
    break;
  case MVT::i16:
    if (DestVT == MVT::i64)
      Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
    else
      Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
    Imm = 15;
    break;
  case MVT::i32:
    assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
    Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
    Imm = 31;
    break;
  }

  // Handle i8 and i16 as i32.
  if (DestVT == MVT::i8 || DestVT == MVT::i16)
    DestVT = MVT::i32;
  else if (DestVT == MVT::i64) {
    unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(AArch64::SUBREG_TO_REG), Src64)
        .addImm(0)
        .addReg(SrcReg)
        .addImm(AArch64::sub_32);
    SrcReg = Src64;
  }

  const TargetRegisterClass *RC =
      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
}

static bool isZExtLoad(const MachineInstr *LI) {
  switch (LI->getOpcode()) {
  default:
    return false;
  case AArch64::LDURBBi:
  case AArch64::LDURHHi:
  case AArch64::LDURWi:
  case AArch64::LDRBBui:
  case AArch64::LDRHHui:
  case AArch64::LDRWui:
  case AArch64::LDRBBroX:
  case AArch64::LDRHHroX:
  case AArch64::LDRWroX:
  case AArch64::LDRBBroW:
  case AArch64::LDRHHroW:
  case AArch64::LDRWroW:
    return true;
  }
}

static bool isSExtLoad(const MachineInstr *LI) {
  switch (LI->getOpcode()) {
  default:
    return false;
  case AArch64::LDURSBWi:
  case AArch64::LDURSHWi:
  case AArch64::LDURSBXi:
  case AArch64::LDURSHXi:
  case AArch64::LDURSWi:
  case AArch64::LDRSBWui:
  case AArch64::LDRSHWui:
  case AArch64::LDRSBXui:
  case AArch64::LDRSHXui:
  case AArch64::LDRSWui:
  case AArch64::LDRSBWroX:
  case AArch64::LDRSHWroX:
  case AArch64::LDRSBXroX:
  case AArch64::LDRSHXroX:
  case AArch64::LDRSWroX:
  case AArch64::LDRSBWroW:
  case AArch64::LDRSHWroW:
  case AArch64::LDRSBXroW:
  case AArch64::LDRSHXroW:
  case AArch64::LDRSWroW:
    return true;
  }
}

bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
                                         MVT SrcVT) {
  const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
  if (!LI || !LI->hasOneUse())
    return false;

  // Check if the load instruction has already been selected.
  unsigned Reg = lookUpRegForValue(LI);
  if (!Reg)
    return false;

  MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
  if (!MI)
    return false;

  // Check if the correct load instruction has been emitted - SelectionDAG might
  // have emitted a zero-extending load, but we need a sign-extending load.
  bool IsZExt = isa<ZExtInst>(I);
  const auto *LoadMI = MI;
  if (LoadMI->getOpcode() == TargetOpcode::COPY &&
      LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
    unsigned LoadReg = MI->getOperand(1).getReg();
    LoadMI = MRI.getUniqueVRegDef(LoadReg);
    assert(LoadMI && "Expected valid instruction");
  }
  if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
    return false;

  // Nothing to be done.
  if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
    updateValueMap(I, Reg);
    return true;
  }

  if (IsZExt) {
    unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(AArch64::SUBREG_TO_REG), Reg64)
        .addImm(0)
        .addReg(Reg, getKillRegState(true))
        .addImm(AArch64::sub_32);
    Reg = Reg64;
  } else {
    assert((MI->getOpcode() == TargetOpcode::COPY &&
            MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
           "Expected copy instruction");
    Reg = MI->getOperand(1).getReg();
    MI->eraseFromParent();
  }
  updateValueMap(I, Reg);
  return true;
}

bool AArch64FastISel::selectIntExt(const Instruction *I) {
  assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
         "Unexpected integer extend instruction.");
  MVT RetVT;
  MVT SrcVT;
  if (!isTypeSupported(I->getType(), RetVT))
    return false;

  if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
    return false;

  // Try to optimize already sign-/zero-extended values from load instructions.
  if (optimizeIntExtLoad(I, RetVT, SrcVT))
    return true;

  unsigned SrcReg = getRegForValue(I->getOperand(0));
  if (!SrcReg)
    return false;
  bool SrcIsKill = hasTrivialKill(I->getOperand(0));

  // Try to optimize already sign-/zero-extended values from function arguments.
  bool IsZExt = isa<ZExtInst>(I);
  if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
    if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
      if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
        unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(AArch64::SUBREG_TO_REG), ResultReg)
            .addImm(0)
            .addReg(SrcReg, getKillRegState(SrcIsKill))
            .addImm(AArch64::sub_32);
        SrcReg = ResultReg;
      }
      // Conservatively clear all kill flags from all uses, because we are
      // replacing a sign-/zero-extend instruction at IR level with a nop at MI
      // level. The result of the instruction at IR level might have been
      // trivially dead, which is now no longer true.
      unsigned UseReg = lookUpRegForValue(I);
      if (UseReg)
        MRI.clearKillFlags(UseReg);

      updateValueMap(I, SrcReg);
      return true;
    }
  }

  unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
  if (!ResultReg)
    return false;

  updateValueMap(I, ResultReg);
  return true;
}
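// For example, srem i32 %a, %b is selected below as:
//   sdiv w8, w0, w1
//   msub w8, w8, w1, w0    ; remainder = a - (a / b) * b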
bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
  EVT DestEVT = TLI.getValueType(DL, I->getType(), true);
  if (!DestEVT.isSimple())
    return false;
  MVT DestVT = DestEVT.getSimpleVT();
  if (DestVT != MVT::i64 && DestVT != MVT::i32)
    return false;

  unsigned DivOpc;
  bool Is64bit = (DestVT == MVT::i64);
  switch (ISDOpcode) {
  default:
    return false;
  case ISD::SREM:
    DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
    break;
  case ISD::UREM:
    DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
    break;
  }
  unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
  unsigned Src0Reg = getRegForValue(I->getOperand(0));
  if (!Src0Reg)
    return false;
  bool Src0IsKill = hasTrivialKill(I->getOperand(0));

  unsigned Src1Reg = getRegForValue(I->getOperand(1));
  if (!Src1Reg)
    return false;
  bool Src1IsKill = hasTrivialKill(I->getOperand(1));

  const TargetRegisterClass *RC =
      (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
  unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
                                     Src1Reg, /*IsKill=*/false);
  assert(QuotReg && "Unexpected DIV instruction emission failure.");
  // The remainder is computed as numerator - (quotient * denominator) using the
  // MSUB instruction.
  unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
                                        Src1Reg, Src1IsKill, Src0Reg,
                                        Src0IsKill);
  updateValueMap(I, ResultReg);
  return true;
}
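// For example, mul i32 %x, 8 is selected below as "lsl w8, w0, #3", and a
// non-free zext/sext feeding the multiply is folded into the UBFM/SBFM that
// emitLSL_ri produces.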
bool AArch64FastISel::selectMul(const Instruction *I) {
  MVT VT;
  if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
    return false;

  if (VT.isVector())
    return selectBinaryOp(I, ISD::MUL);

  const Value *Src0 = I->getOperand(0);
  const Value *Src1 = I->getOperand(1);
  if (const auto *C = dyn_cast<ConstantInt>(Src0))
    if (C->getValue().isPowerOf2())
      std::swap(Src0, Src1);

  // Try to simplify to a shift instruction.
  if (const auto *C = dyn_cast<ConstantInt>(Src1))
    if (C->getValue().isPowerOf2()) {
      uint64_t ShiftVal = C->getValue().logBase2();
      MVT SrcVT = VT;
      bool IsZExt = true;
      if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
        if (!isIntExtFree(ZExt)) {
          MVT VT;
          if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) {
            SrcVT = VT;
            IsZExt = true;
            Src0 = ZExt->getOperand(0);
          }
        }
      } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
        if (!isIntExtFree(SExt)) {
          MVT VT;
          if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) {
            SrcVT = VT;
            IsZExt = false;
            Src0 = SExt->getOperand(0);
          }
        }
      }

      unsigned Src0Reg = getRegForValue(Src0);
      if (!Src0Reg)
        return false;
      bool Src0IsKill = hasTrivialKill(Src0);

      unsigned ResultReg =
          emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);

      if (ResultReg) {
        updateValueMap(I, ResultReg);
        return true;
      }
    }

  unsigned Src0Reg = getRegForValue(I->getOperand(0));
  if (!Src0Reg)
    return false;
  bool Src0IsKill = hasTrivialKill(I->getOperand(0));

  unsigned Src1Reg = getRegForValue(I->getOperand(1));
  if (!Src1Reg)
    return false;
  bool Src1IsKill = hasTrivialKill(I->getOperand(1));

  unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
  if (!ResultReg)
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

bool AArch64FastISel::selectShift(const Instruction *I) {
  MVT RetVT;
  if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
    return false;

  if (RetVT.isVector())
    return selectOperator(I, I->getOpcode());

  if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
    unsigned ResultReg = 0;
    uint64_t ShiftVal = C->getZExtValue();
    MVT SrcVT = RetVT;
    bool IsZExt = I->getOpcode() != Instruction::AShr;
    const Value *Op0 = I->getOperand(0);
    if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) {
      if (!isIntExtFree(ZExt)) {
        MVT TmpVT;
        if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
          SrcVT = TmpVT;
          IsZExt = true;
          Op0 = ZExt->getOperand(0);
        }
      }
    } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) {
      if (!isIntExtFree(SExt)) {
        MVT TmpVT;
        if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
          SrcVT = TmpVT;
          IsZExt = false;
          Op0 = SExt->getOperand(0);
        }
      }
    }

    unsigned Op0Reg = getRegForValue(Op0);
    if (!Op0Reg)
      return false;
    bool Op0IsKill = hasTrivialKill(Op0);

    switch (I->getOpcode()) {
    default: llvm_unreachable("Unexpected instruction.");
    case Instruction::Shl:
      ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
      break;
    case Instruction::AShr:
      ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
      break;
    case Instruction::LShr:
      ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
      break;
    }
    if (!ResultReg)
      return false;

    updateValueMap(I, ResultReg);
    return true;
  }

  unsigned Op0Reg = getRegForValue(I->getOperand(0));
  if (!Op0Reg)
    return false;
  bool Op0IsKill = hasTrivialKill(I->getOperand(0));

  unsigned Op1Reg = getRegForValue(I->getOperand(1));
  if (!Op1Reg)
    return false;
  bool Op1IsKill = hasTrivialKill(I->getOperand(1));
  unsigned ResultReg = 0;
  switch (I->getOpcode()) {
  default: llvm_unreachable("Unexpected instruction.");
  case Instruction::Shl:
    ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
    break;
  case Instruction::AShr:
    ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
    break;
  case Instruction::LShr:
    ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
    break;
  }

  if (!ResultReg)
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

bool AArch64FastISel::selectBitCast(const Instruction *I) {
  MVT RetVT, SrcVT;

  if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT))
    return false;
  if (!isTypeLegal(I->getType(), RetVT))
    return false;

  unsigned Opc;
  if (RetVT == MVT::f32 && SrcVT == MVT::i32)
    Opc = AArch64::FMOVWSr;
  else if (RetVT == MVT::f64 && SrcVT == MVT::i64)
    Opc = AArch64::FMOVXDr;
  else if (RetVT == MVT::i32 && SrcVT == MVT::f32)
    Opc = AArch64::FMOVSWr;
  else if (RetVT == MVT::i64 && SrcVT == MVT::f64)
    Opc = AArch64::FMOVDXr;
  else
    return false;

  const TargetRegisterClass *RC = nullptr;
  switch (RetVT.SimpleTy) {
  default: llvm_unreachable("Unexpected value type.");
  case MVT::i32: RC = &AArch64::GPR32RegClass; break;
  case MVT::i64: RC = &AArch64::GPR64RegClass; break;
  case MVT::f32: RC = &AArch64::FPR32RegClass; break;
  case MVT::f64: RC = &AArch64::FPR64RegClass; break;
  }
  unsigned Op0Reg = getRegForValue(I->getOperand(0));
  if (!Op0Reg)
    return false;
  bool Op0IsKill = hasTrivialKill(I->getOperand(0));
  unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);

  if (!ResultReg)
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

bool AArch64FastISel::selectFRem(const Instruction *I) {
  MVT RetVT;
  if (!isTypeLegal(I->getType(), RetVT))
    return false;

  RTLIB::Libcall LC;
  switch (RetVT.SimpleTy) {
  default:
    return false;
  case MVT::f32:
    LC = RTLIB::REM_F32;
    break;
  case MVT::f64:
    LC = RTLIB::REM_F64;
    break;
  }

  ArgListTy Args;
  Args.reserve(I->getNumOperands());

  // Populate the argument list.
  for (auto &Arg : I->operands()) {
    ArgListEntry Entry;
    Entry.Val = Arg;
    Entry.Ty = Arg->getType();
    Args.push_back(Entry);
  }

  CallLoweringInfo CLI;
  MCContext &Ctx = MF->getContext();
  CLI.setCallee(DL, Ctx, TLI.getLibcallCallingConv(LC), I->getType(),
                TLI.getLibcallName(LC), std::move(Args));
  if (!lowerCallTo(CLI))
    return false;
  updateValueMap(I, CLI.ResultReg);
  return true;
}
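// For example, sdiv i32 %x, 8 (not marked exact) is selected below as:
//   add  w8, w0, #7
//   cmp  w0, #0
//   csel w8, w8, w0, lt
//   asr  w8, w8, #3
// with an additional negation of the result for a negative divisor.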
bool AArch64FastISel::selectSDiv(const Instruction *I) {
  MVT VT;
  if (!isTypeLegal(I->getType(), VT))
    return false;

  if (!isa<ConstantInt>(I->getOperand(1)))
    return selectBinaryOp(I, ISD::SDIV);

  const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
  if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
      !(C.isPowerOf2() || (-C).isPowerOf2()))
    return selectBinaryOp(I, ISD::SDIV);

  unsigned Lg2 = C.countTrailingZeros();
  unsigned Src0Reg = getRegForValue(I->getOperand(0));
  if (!Src0Reg)
    return false;
  bool Src0IsKill = hasTrivialKill(I->getOperand(0));

  if (cast<BinaryOperator>(I)->isExact()) {
    unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
    if (!ResultReg)
      return false;
    updateValueMap(I, ResultReg);
    return true;
  }

  int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
  unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
  if (!AddReg)
    return false;

  // (Src0 < 0) ? Pow2 - 1 : 0;
  if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
    return false;

  unsigned SelectOpc;
  const TargetRegisterClass *RC;
  if (VT == MVT::i64) {
    SelectOpc = AArch64::CSELXr;
    RC = &AArch64::GPR64RegClass;
  } else {
    SelectOpc = AArch64::CSELWr;
    RC = &AArch64::GPR32RegClass;
  }
  unsigned SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true,
                                        Src0Reg, Src0IsKill, AArch64CC::LT);
  if (!SelectReg)
    return false;

  // Divide by Pow2 --> ashr. If we're dividing by a negative value we must
  // also negate the result.
  unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
  unsigned ResultReg;
  if (C.isNegative())
    ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
                              SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
  else
    ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);

  if (!ResultReg)
    return false;

  updateValueMap(I, ResultReg);
  return true;
}

/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We
/// have to duplicate it for AArch64, because otherwise we would fail during the
/// sign-extend emission.
std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
  unsigned IdxN = getRegForValue(Idx);
  if (IdxN == 0)
    // Unhandled operand. Halt "fast" selection and bail.
    return std::pair<unsigned, bool>(0, false);

  bool IdxNIsKill = hasTrivialKill(Idx);

  // If the index is smaller or larger than intptr_t, truncate or extend it.
  MVT PtrVT = TLI.getPointerTy(DL);
  EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
  if (IdxVT.bitsLT(PtrVT)) {
    IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
    IdxNIsKill = true;
  } else if (IdxVT.bitsGT(PtrVT))
    llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
  return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
}

/// This is mostly a copy of the existing FastISel GEP code, but we have to
/// duplicate it for AArch64, because otherwise we would bail out even for
/// simple cases. This is because the standard fastEmit functions don't cover
/// MUL at all and ADD is lowered very inefficiently.
bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
  unsigned N = getRegForValue(I->getOperand(0));
  if (!N)
    return false;
  bool NIsKill = hasTrivialKill(I->getOperand(0));

  // Keep a running tab of the total offset to coalesce multiple N = N + Offset
  // into a single N = N + TotalOffset.
  uint64_t TotalOffs = 0;
  MVT VT = TLI.getPointerTy(DL);
  for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
       GTI != E; ++GTI) {
    const Value *Idx = GTI.getOperand();
    if (auto *StTy = GTI.getStructTypeOrNull()) {
      unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
      // N = N + Offset
      if (Field)
        TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
    } else {
      Type *Ty = GTI.getIndexedType();

      // If this is a constant subscript, handle it quickly.
      if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
        if (CI->isZero())
          continue;
        // N = N + Offset
        TotalOffs +=
            DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
        continue;
      }
      if (TotalOffs) {
        N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
        if (!N)
          return false;
        NIsKill = true;
        TotalOffs = 0;
      }

      // N = N + Idx * ElementSize;
      uint64_t ElementSize = DL.getTypeAllocSize(Ty);
      std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
      unsigned IdxN = Pair.first;
      bool IdxNIsKill = Pair.second;
      if (!IdxN)
        return false;

      if (ElementSize != 1) {
        unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
        if (!C)
          return false;
        IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
        if (!IdxN)
          return false;
        IdxNIsKill = true;
      }
      N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
      if (!N)
        return false;
    }
  }
  if (TotalOffs) {
    N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
    if (!N)
      return false;
  }
  updateValueMap(I, N);
  return true;
}
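// The expansion below is, e.g. for a 32-bit cmpxchg:
//   CMP_SWAP_32 w(old), w(scratch), x(addr), w(desired), w(new)
//   subs  wzr, w(old), w(desired)
//   csinc w(status), wzr, wzr, ne   ; i.e. cset w(status), eq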
bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
  assert(TM.getOptLevel() == CodeGenOpt::None &&
         "cmpxchg survived AtomicExpand at optlevel > -O0");

  auto *RetPairTy = cast<StructType>(I->getType());
  Type *RetTy = RetPairTy->getTypeAtIndex(0U);
  assert(RetPairTy->getTypeAtIndex(1U)->isIntegerTy(1) &&
         "cmpxchg has a non-i1 status result");

  MVT VT;
  if (!isTypeLegal(RetTy, VT))
    return false;

  const TargetRegisterClass *ResRC;
  unsigned Opc, CmpOpc;
  // This only supports i32/i64, because i8/i16 aren't legal, and the generic
  // extractvalue selection doesn't support that.
  if (VT == MVT::i32) {
    Opc = AArch64::CMP_SWAP_32;
    CmpOpc = AArch64::SUBSWrs;
    ResRC = &AArch64::GPR32RegClass;
  } else if (VT == MVT::i64) {
    Opc = AArch64::CMP_SWAP_64;
    CmpOpc = AArch64::SUBSXrs;
    ResRC = &AArch64::GPR64RegClass;
  } else {
    return false;
  }

  const MCInstrDesc &II = TII.get(Opc);

  const unsigned AddrReg = constrainOperandRegClass(
      II, getRegForValue(I->getPointerOperand()), II.getNumDefs());
  const unsigned DesiredReg = constrainOperandRegClass(
      II, getRegForValue(I->getCompareOperand()), II.getNumDefs() + 1);
  const unsigned NewReg = constrainOperandRegClass(
      II, getRegForValue(I->getNewValOperand()), II.getNumDefs() + 2);

  const unsigned ResultReg1 = createResultReg(ResRC);
  const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
  const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass);

  // FIXME: MachineMemOperand doesn't support cmpxchg yet.
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
      .addDef(ResultReg1)
      .addDef(ScratchReg)
      .addUse(AddrReg)
      .addUse(DesiredReg)
      .addUse(NewReg);

  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
      .addDef(VT == MVT::i32 ? AArch64::WZR : AArch64::XZR)
      .addUse(ResultReg1)
      .addUse(DesiredReg)
      .addImm(0);

  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr))
      .addDef(ResultReg2)
      .addUse(AArch64::WZR)
      .addUse(AArch64::WZR)
      .addImm(AArch64CC::NE);

  assert((ResultReg1 + 1) == ResultReg2 && "Nonconsecutive result registers.");
  updateValueMap(I, ResultReg1, 2);
  return true;
}

bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
  switch (I->getOpcode()) {
  default:
    break;
  case Instruction::Add:
  case Instruction::Sub:
    return selectAddSub(I);
  case Instruction::Mul:
    return selectMul(I);
  case Instruction::SDiv:
    return selectSDiv(I);
  case Instruction::SRem:
    if (!selectBinaryOp(I, ISD::SREM))
      return selectRem(I, ISD::SREM);
    return true;
  case Instruction::URem:
    if (!selectBinaryOp(I, ISD::UREM))
      return selectRem(I, ISD::UREM);
    return true;
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    return selectShift(I);
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    return selectLogicalOp(I);
  case Instruction::Br:
    return selectBranch(I);
  case Instruction::IndirectBr:
    return selectIndirectBr(I);
  case Instruction::BitCast:
    if (!FastISel::selectBitCast(I))
      return selectBitCast(I);
    return true;
  case Instruction::FPToSI:
    if (!selectCast(I, ISD::FP_TO_SINT))
      return selectFPToInt(I, /*Signed=*/true);
    return true;
  case Instruction::FPToUI:
    return selectFPToInt(I, /*Signed=*/false);
  case Instruction::ZExt:
  case Instruction::SExt:
    return selectIntExt(I);
  case Instruction::Trunc:
    if (!selectCast(I, ISD::TRUNCATE))
      return selectTrunc(I);
    return true;
  case Instruction::FPExt:
    return selectFPExt(I);
  case Instruction::FPTrunc:
    return selectFPTrunc(I);
  case Instruction::SIToFP:
    if (!selectCast(I, ISD::SINT_TO_FP))
      return selectIntToFP(I, /*Signed=*/true);
    return true;
  case Instruction::UIToFP:
    return selectIntToFP(I, /*Signed=*/false);
  case Instruction::Load:
    return selectLoad(I);
  case Instruction::Store:
    return selectStore(I);
  case Instruction::FCmp:
  case Instruction::ICmp:
    return selectCmp(I);
  case Instruction::Select:
    return selectSelect(I);
  case Instruction::Ret:
    return selectRet(I);
  case Instruction::FRem:
    return selectFRem(I);
  case Instruction::GetElementPtr:
    return selectGetElementPtr(I);
  case Instruction::AtomicCmpXchg:
    return selectAtomicCmpXchg(cast<AtomicCmpXchgInst>(I));
  }

  // Silence warnings.
  (void)&CC_AArch64_DarwinPCS_VarArg;
  (void)&CC_AArch64_Win64_VarArg;

  // fall-back to target-independent instruction selection.
  return selectOperator(I, I->getOpcode());
}

namespace llvm {

FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) {
  return new AArch64FastISel(FuncInfo, LibInfo);
}

} // end namespace llvm

Index: head/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp (revision 328752)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp (revision 328753)
@@ -1,1566 +1,1572 @@
//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
  static const char *getName() { return DEBUG_TYPE; }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;
  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

  // We declare the temporaries used by selectImpl() in the class to minimize
  // the cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_DECL }; } // end anonymous namespace #define GET_GLOBALISEL_IMPL #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_IMPL AArch64InstructionSelector::AArch64InstructionSelector( const AArch64TargetMachine &TM, const AArch64Subtarget &STI, const AArch64RegisterBankInfo &RBI) : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), #define GET_GLOBALISEL_PREDICATES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_PREDICATES_INIT #define GET_GLOBALISEL_TEMPORARIES_INIT #include "AArch64GenGlobalISel.inc" #undef GET_GLOBALISEL_TEMPORARIES_INIT { } // FIXME: This should be target-independent, inferred from the types declared // for each class in the bank. static const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB, const RegisterBankInfo &RBI) { if (RB.getID() == AArch64::GPRRegBankID) { if (Ty.getSizeInBits() <= 32) return &AArch64::GPR32RegClass; if (Ty.getSizeInBits() == 64) return &AArch64::GPR64RegClass; return nullptr; } if (RB.getID() == AArch64::FPRRegBankID) { if (Ty.getSizeInBits() == 32) return &AArch64::FPR32RegClass; if (Ty.getSizeInBits() == 64) return &AArch64::FPR64RegClass; if (Ty.getSizeInBits() == 128) return &AArch64::FPR128RegClass; return nullptr; } return nullptr; } /// Check whether \p I is a currently unsupported binary operation: /// - it has an unsized type /// - an operand is not a vreg /// - all operands are not in the same bank /// These are checks that should someday live in the verifier, but right now, /// these are mostly limitations of the aarch64 selector. static bool unsupportedBinOp(const MachineInstr &I, const AArch64RegisterBankInfo &RBI, const MachineRegisterInfo &MRI, const AArch64RegisterInfo &TRI) { LLT Ty = MRI.getType(I.getOperand(0).getReg()); if (!Ty.isValid()) { DEBUG(dbgs() << "Generic binop register should be typed\n"); return true; } const RegisterBank *PrevOpBank = nullptr; for (auto &MO : I.operands()) { // FIXME: Support non-register operands. if (!MO.isReg()) { DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n"); return true; } // FIXME: Can generic operations have physical registers operands? If // so, this will need to be taught about that, and we'll need to get the // bank out of the minimal class for the register. // Either way, this needs to be documented (and possibly verified). if (!TargetRegisterInfo::isVirtualRegister(MO.getReg())) { DEBUG(dbgs() << "Generic inst has physical register operand\n"); return true; } const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI); if (!OpBank) { DEBUG(dbgs() << "Generic register has no bank or class\n"); return true; } if (PrevOpBank && OpBank != PrevOpBank) { DEBUG(dbgs() << "Generic inst operands have different banks\n"); return true; } PrevOpBank = OpBank; } return false; } /// Select the AArch64 opcode for the basic binary operation \p GenericOpc /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID /// and of size \p OpSize. /// \returns \p GenericOpc if the combination is unsupported. 
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, unsigned OpSize) { switch (RegBankID) { case AArch64::GPRRegBankID: if (OpSize == 32) { switch (GenericOpc) { case TargetOpcode::G_SHL: return AArch64::LSLVWr; case TargetOpcode::G_LSHR: return AArch64::LSRVWr; case TargetOpcode::G_ASHR: return AArch64::ASRVWr; default: return GenericOpc; } } else if (OpSize == 64) { switch (GenericOpc) { case TargetOpcode::G_GEP: return AArch64::ADDXrr; case TargetOpcode::G_SHL: return AArch64::LSLVXr; case TargetOpcode::G_LSHR: return AArch64::LSRVXr; case TargetOpcode::G_ASHR: return AArch64::ASRVXr; default: return GenericOpc; } } break; case AArch64::FPRRegBankID: switch (OpSize) { case 32: switch (GenericOpc) { case TargetOpcode::G_FADD: return AArch64::FADDSrr; case TargetOpcode::G_FSUB: return AArch64::FSUBSrr; case TargetOpcode::G_FMUL: return AArch64::FMULSrr; case TargetOpcode::G_FDIV: return AArch64::FDIVSrr; default: return GenericOpc; } case 64: switch (GenericOpc) { case TargetOpcode::G_FADD: return AArch64::FADDDrr; case TargetOpcode::G_FSUB: return AArch64::FSUBDrr; case TargetOpcode::G_FMUL: return AArch64::FMULDrr; case TargetOpcode::G_FDIV: return AArch64::FDIVDrr; case TargetOpcode::G_OR: return AArch64::ORRv8i8; default: return GenericOpc; } } break; } return GenericOpc; } /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc, /// appropriate for the (value) register bank \p RegBankID and of memory access /// size \p OpSize. This returns the variant with the base+unsigned-immediate /// addressing mode (e.g., LDRXui). /// \returns \p GenericOpc if the combination is unsupported. static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, unsigned OpSize) { const bool isStore = GenericOpc == TargetOpcode::G_STORE; switch (RegBankID) { case AArch64::GPRRegBankID: switch (OpSize) { case 8: return isStore ? AArch64::STRBBui : AArch64::LDRBBui; case 16: return isStore ? AArch64::STRHHui : AArch64::LDRHHui; case 32: return isStore ? AArch64::STRWui : AArch64::LDRWui; case 64: return isStore ? AArch64::STRXui : AArch64::LDRXui; } break; case AArch64::FPRRegBankID: switch (OpSize) { case 8: return isStore ? AArch64::STRBui : AArch64::LDRBui; case 16: return isStore ? AArch64::STRHui : AArch64::LDRHui; case 32: return isStore ? AArch64::STRSui : AArch64::LDRSui; case 64: return isStore ? AArch64::STRDui : AArch64::LDRDui; } break; } return GenericOpc; } static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI) { unsigned DstReg = I.getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { assert(I.isCopy() && "Generic operators do not allow physical registers"); return true; } const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); unsigned SrcReg = I.getOperand(1).getReg(); const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); (void)SrcSize; assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && "No phys reg on generic operators"); assert( (DstSize == SrcSize || // Copies are a mean to setup initial types, the number of // bits may not exactly match. (TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) || // Copies are a mean to copy bits around, as long as we are // on the same register class, that's fine. Otherwise, that // means we need some SUBREG_TO_REG or AND & co. 
(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) && "Copy with different width?!"); assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) && "GPRs cannot get more than 64-bit width values"); const TargetRegisterClass *RC = nullptr; if (RegBank.getID() == AArch64::FPRRegBankID) { if (DstSize <= 16) RC = &AArch64::FPR16RegClass; else if (DstSize <= 32) RC = &AArch64::FPR32RegClass; else if (DstSize <= 64) RC = &AArch64::FPR64RegClass; else if (DstSize <= 128) RC = &AArch64::FPR128RegClass; else { DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n'); return false; } } else { assert(RegBank.getID() == AArch64::GPRRegBankID && "Bitcast for the flags?"); RC = DstSize <= 32 ? &AArch64::GPR32allRegClass : &AArch64::GPR64allRegClass; } // No need to constrain SrcReg. It will get constrained when // we hit another of its use or its defs. // Copies do not have constraints. if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) << " operand\n"); return false; } I.setDesc(TII.get(AArch64::COPY)); return true; } static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) { if (!DstTy.isScalar() || !SrcTy.isScalar()) return GenericOpc; const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); switch (DstSize) { case 32: switch (SrcSize) { case 32: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUWSri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUWSri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUWSr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUWSr; default: return GenericOpc; } case 64: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUXSri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUXSri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUWDr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUWDr; default: return GenericOpc; } default: return GenericOpc; } case 64: switch (SrcSize) { case 32: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUWDri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUWDri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUXSr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUXSr; default: return GenericOpc; } case 64: switch (GenericOpc) { case TargetOpcode::G_SITOFP: return AArch64::SCVTFUXDri; case TargetOpcode::G_UITOFP: return AArch64::UCVTFUXDri; case TargetOpcode::G_FPTOSI: return AArch64::FCVTZSUXDr; case TargetOpcode::G_FPTOUI: return AArch64::FCVTZUUXDr; default: return GenericOpc; } default: return GenericOpc; } default: return GenericOpc; }; return GenericOpc; } static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) { switch (P) { default: llvm_unreachable("Unknown condition code!"); case CmpInst::ICMP_NE: return AArch64CC::NE; case CmpInst::ICMP_EQ: return AArch64CC::EQ; case CmpInst::ICMP_SGT: return AArch64CC::GT; case CmpInst::ICMP_SGE: return AArch64CC::GE; case CmpInst::ICMP_SLT: return AArch64CC::LT; case CmpInst::ICMP_SLE: return AArch64CC::LE; case CmpInst::ICMP_UGT: return AArch64CC::HI; case CmpInst::ICMP_UGE: return AArch64CC::HS; case CmpInst::ICMP_ULT: return AArch64CC::LO; case CmpInst::ICMP_ULE: return AArch64CC::LS; } } static void changeFCMPPredToAArch64CC(CmpInst::Predicate P, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (P) { default: llvm_unreachable("Unknown FP condition!"); case CmpInst::FCMP_OEQ: 
CondCode = AArch64CC::EQ; break; case CmpInst::FCMP_OGT: CondCode = AArch64CC::GT; break; case CmpInst::FCMP_OGE: CondCode = AArch64CC::GE; break; case CmpInst::FCMP_OLT: CondCode = AArch64CC::MI; break; case CmpInst::FCMP_OLE: CondCode = AArch64CC::LS; break; case CmpInst::FCMP_ONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case CmpInst::FCMP_ORD: CondCode = AArch64CC::VC; break; case CmpInst::FCMP_UNO: CondCode = AArch64CC::VS; break; case CmpInst::FCMP_UEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case CmpInst::FCMP_UGT: CondCode = AArch64CC::HI; break; case CmpInst::FCMP_UGE: CondCode = AArch64CC::PL; break; case CmpInst::FCMP_ULT: CondCode = AArch64CC::LT; break; case CmpInst::FCMP_ULE: CondCode = AArch64CC::LE; break; case CmpInst::FCMP_UNE: CondCode = AArch64CC::NE; break; } } bool AArch64InstructionSelector::selectCompareBranch( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); MachineInstr *CCMI = MRI.getVRegDef(CondReg); if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) CCMI = MRI.getVRegDef(CCMI->getOperand(1).getReg()); if (CCMI->getOpcode() != TargetOpcode::G_ICMP) return false; unsigned LHS = CCMI->getOperand(2).getReg(); unsigned RHS = CCMI->getOperand(3).getReg(); if (!getConstantVRegVal(RHS, MRI)) std::swap(RHS, LHS); const auto RHSImm = getConstantVRegVal(RHS, MRI); if (!RHSImm || *RHSImm != 0) return false; const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) return false; const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate(); if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ) return false; const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits(); unsigned CBOpc = 0; if (CmpWidth <= 32) CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW); else if (CmpWidth == 64) CBOpc = (Pred == CmpInst::ICMP_EQ ? 
AArch64::CBZX : AArch64::CBNZX); else return false; auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc)) .addUse(LHS) .addMBB(DestMBB); constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::selectVaStartAAPCS( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { return false; } bool AArch64InstructionSelector::selectVaStartDarwin( MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const { AArch64FunctionInfo *FuncInfo = MF.getInfo(); unsigned ListReg = I.getOperand(0).getReg(); unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri)) .addDef(ArgsAddrReg) .addFrameIndex(FuncInfo->getVarArgsStackIndex()) .addImm(0) .addImm(0); constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui)) .addUse(ArgsAddrReg) .addUse(ListReg) .addImm(0) .addMemOperand(*I.memoperands_begin()); constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); I.eraseFromParent(); return true; } bool AArch64InstructionSelector::select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned Opcode = I.getOpcode(); // G_PHI requires same handling as PHI if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) { // Certain non-generic instructions also need some special handling. if (Opcode == TargetOpcode::LOAD_STACK_GUARD) return constrainSelectedInstRegOperands(I, TII, TRI, RBI); if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) { const unsigned DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); const TargetRegisterClass *DefRC = nullptr; if (TargetRegisterInfo::isPhysicalRegister(DefReg)) { DefRC = TRI.getRegClass(DefReg); } else { const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(DefReg); DefRC = RegClassOrBank.dyn_cast(); if (!DefRC) { if (!DefTy.isValid()) { DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); return false; } const RegisterBank &RB = *RegClassOrBank.get(); DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI); if (!DefRC) { DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); return false; } } } I.setDesc(TII.get(TargetOpcode::PHI)); return RBI.constrainGenericRegister(DefReg, *DefRC, MRI); } if (I.isCopy()) return selectCopy(I, TII, MRI, TRI, RBI); return true; } if (I.getNumOperands() != I.getNumExplicitOperands()) { DEBUG(dbgs() << "Generic instruction has unexpected implicit operands\n"); return false; } if (selectImpl(I, CoverageInfo)) return true; LLT Ty = I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{}; switch (Opcode) { case TargetOpcode::G_BRCOND: { if (Ty.getSizeInBits() > 32) { // We shouldn't need this on AArch64, but it would be implemented as an // EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the // bit being tested is < 32. 
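      // Illustrative sketch of the issue: testing bit 0 of a 64-bit
      // condition would first need the low word, e.g.
      //   %w:gpr32 = COPY %cond.sub_32
      //   TBNZW %w, 0, %bb.dest
      // because the X-register form only encodes bit numbers 32-63.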
DEBUG(dbgs() << "G_BRCOND has type: " << Ty << ", expected at most 32-bits"); return false; } const unsigned CondReg = I.getOperand(0).getReg(); MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); if (selectCompareBranch(I, MF, MRI)) return true; auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW)) .addUse(CondReg) .addImm(/*bit offset=*/0) .addMBB(DestMBB); I.eraseFromParent(); return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI); } case TargetOpcode::G_BRINDIRECT: { I.setDesc(TII.get(AArch64::BR)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_CONSTANT: { const bool isFP = Opcode == TargetOpcode::G_FCONSTANT; const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); const LLT p0 = LLT::pointer(0, 64); const unsigned DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI.getType(DefReg); const unsigned DefSize = DefTy.getSizeInBits(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); // FIXME: Redundant check, but even less readable when factored out. if (isFP) { if (Ty != s32 && Ty != s64) { DEBUG(dbgs() << "Unable to materialize FP " << Ty << " constant, expected: " << s32 << " or " << s64 << '\n'); return false; } if (RB.getID() != AArch64::FPRRegBankID) { DEBUG(dbgs() << "Unable to materialize FP " << Ty << " constant on bank: " << RB << ", expected: FPR\n"); return false; } // The case when we have 0.0 is covered by tablegen. Reject it here so we // can be sure tablegen works correctly and isn't rescued by this code. if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0)) return false; } else { // s32 and s64 are covered by tablegen. if (Ty != p0) { DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant, expected: " << s32 << ", " << s64 << ", or " << p0 << '\n'); return false; } if (RB.getID() != AArch64::GPRRegBankID) { DEBUG(dbgs() << "Unable to materialize integer " << Ty << " constant on bank: " << RB << ", expected: GPR\n"); return false; } } const unsigned MovOpc = DefSize == 32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; I.setDesc(TII.get(MovOpc)); if (isFP) { const TargetRegisterClass &GPRRC = DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass; const TargetRegisterClass &FPRRC = DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass; const unsigned DefGPRReg = MRI.createVirtualRegister(&GPRRC); MachineOperand &RegOp = I.getOperand(0); RegOp.setReg(DefGPRReg); BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), TII.get(AArch64::COPY)) .addDef(DefReg) .addUse(DefGPRReg); if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) { DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n"); return false; } MachineOperand &ImmOp = I.getOperand(1); // FIXME: Is going through int64_t always correct? ImmOp.ChangeToImmediate( ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); } else if (I.getOperand(1).isCImm()) { uint64_t Val = I.getOperand(1).getCImm()->getZExtValue(); I.getOperand(1).ChangeToImmediate(Val); } else if (I.getOperand(1).isImm()) { uint64_t Val = I.getOperand(1).getImm(); I.getOperand(1).ChangeToImmediate(Val); } constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } case TargetOpcode::G_EXTRACT: { LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); // Larger extracts are vectors, same-size extracts should be something else // by now (either split up or simplified to a COPY). 
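    // For example, a 16-bit extract at bit 32 of an s64 source becomes
    //   UBFMXri %tmp, %src, 32, 47
    // (imms = lsb + width - 1), followed by the COPY of the sub_32
    // subregister built below.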
if (SrcTy.getSizeInBits() > 64 || Ty.getSizeInBits() > 32) return false; I.setDesc(TII.get(AArch64::UBFMXri)); MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() + Ty.getSizeInBits() - 1); unsigned DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, std::next(I.getIterator()), I.getDebugLoc(), TII.get(AArch64::COPY)) .addDef(I.getOperand(0).getReg()) .addUse(DstReg, 0, AArch64::sub_32); RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR32RegClass, MRI); I.getOperand(0).setReg(DstReg); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_INSERT: { LLT SrcTy = MRI.getType(I.getOperand(2).getReg()); // Larger inserts are vectors, same-size ones should be something else by // now (split up or turned into COPYs). if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32) return false; I.setDesc(TII.get(AArch64::BFMXri)); unsigned LSB = I.getOperand(3).getImm(); unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits(); I.getOperand(3).setImm((64 - LSB) % 64); MachineInstrBuilder(MF, I).addImm(Width - 1); unsigned SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64)); BuildMI(MBB, I.getIterator(), I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(SrcReg) .addImm(0) .addUse(I.getOperand(2).getReg()) .addImm(AArch64::sub_32); RBI.constrainGenericRegister(I.getOperand(2).getReg(), AArch64::GPR32RegClass, MRI); I.getOperand(2).setReg(SrcReg); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_FRAME_INDEX: { // allocas and G_FRAME_INDEX are only supported in addrspace(0). if (Ty != LLT::pointer(0, 64)) { DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } I.setDesc(TII.get(AArch64::ADDXri)); // MOs for a #0 shifted immediate. I.addOperand(MachineOperand::CreateImm(0)); I.addOperand(MachineOperand::CreateImm(0)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_GLOBAL_VALUE: { auto GV = I.getOperand(1).getGlobal(); if (GV->isThreadLocal()) { // FIXME: we don't support TLS yet. return false; } unsigned char OpFlags = STI.ClassifyGlobalReference(GV, TM); if (OpFlags & AArch64II::MO_GOT) { I.setDesc(TII.get(AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); } else if (TM.getCodeModel() == CodeModel::Large) { // Materialize the global using movz/movk instructions. unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto InsertPt = std::next(I.getIterator()); auto MovZ = BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi)) .addDef(MovZDstReg); MovZ->addOperand(MF, I.getOperand(1)); MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 | AArch64II::MO_NC); MovZ->addOperand(MF, MachineOperand::CreateImm(0)); constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI); auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags, unsigned Offset, unsigned ForceDstReg) { unsigned DstReg = ForceDstReg ? 
ForceDstReg : MRI.createVirtualRegister(&AArch64::GPR64RegClass); auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(), TII.get(AArch64::MOVKXi)) .addDef(DstReg) .addReg(SrcReg); MovI->addOperand(MF, MachineOperand::CreateGA( GV, MovZ->getOperand(1).getOffset(), Flags)); MovI->addOperand(MF, MachineOperand::CreateImm(Offset)); constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI); return DstReg; }; unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(), AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0); DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0); BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg()); I.eraseFromParent(); return true; } else { I.setDesc(TII.get(AArch64::MOVaddr)); I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE); MachineInstrBuilder MIB(MF, I); MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(), OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_LOAD: case TargetOpcode::G_STORE: { LLT MemTy = Ty; LLT PtrTy = MRI.getType(I.getOperand(1).getReg()); if (PtrTy != LLT::pointer(0, 64)) { DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy << ", expected: " << LLT::pointer(0, 64) << '\n'); return false; } auto &MemOp = **I.memoperands_begin(); if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) { DEBUG(dbgs() << "Atomic load/store not supported yet\n"); return false; } + // FIXME: PR36018: Volatile loads in some cases are incorrectly selected by + // folding with an extend. Until we have a G_SEXTLOAD solution bail out if + // we hit one. + if (Opcode == TargetOpcode::G_LOAD && MemOp.isVolatile()) + return false; + const unsigned PtrReg = I.getOperand(1).getReg(); #ifndef NDEBUG const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI); // Sanity-check the pointer register. assert(PtrRB.getID() == AArch64::GPRRegBankID && "Load/Store pointer operand isn't a GPR"); assert(MRI.getType(PtrReg).isPointer() && "Load/Store pointer operand isn't a pointer"); #endif const unsigned ValReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI); const unsigned NewOpc = selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemTy.getSizeInBits()); if (NewOpc == I.getOpcode()) return false; I.setDesc(TII.get(NewOpc)); uint64_t Offset = 0; auto *PtrMI = MRI.getVRegDef(PtrReg); // Try to fold a GEP into our unsigned immediate addressing mode. if (PtrMI->getOpcode() == TargetOpcode::G_GEP) { if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { int64_t Imm = *COff; const unsigned Size = MemTy.getSizeInBits() / 8; const unsigned Scale = Log2_32(Size); if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) { unsigned Ptr2Reg = PtrMI->getOperand(1).getReg(); I.getOperand(1).setReg(Ptr2Reg); PtrMI = MRI.getVRegDef(Ptr2Reg); Offset = Imm / Size; } } } // If we haven't folded anything into our addressing mode yet, try to fold // a frame index into the base+offset. if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX) I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex()); I.addOperand(MachineOperand::CreateImm(Offset)); // If we're storing a 0, use WZR/XZR. 
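    // e.g. a "G_STORE 0, %p" that selected to STRWui becomes
    //   STRWui %wzr, %base, #off
    // saving the MOVi32imm that would otherwise materialize the zero.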
if (auto CVal = getConstantVRegVal(ValReg, MRI)) { if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) { if (I.getOpcode() == AArch64::STRWui) I.getOperand(0).setReg(AArch64::WZR); else if (I.getOpcode() == AArch64::STRXui) I.getOperand(0).setReg(AArch64::XZR); } } return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_SMULH: case TargetOpcode::G_UMULH: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; const unsigned DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n"); return false; } if (Ty != LLT::scalar(64)) { DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty << ", expected: " << LLT::scalar(64) << '\n'); return false; } unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr : AArch64::UMULHrr; I.setDesc(TII.get(NewOpc)); // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_FADD: case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: case TargetOpcode::G_OR: case TargetOpcode::G_SHL: case TargetOpcode::G_LSHR: case TargetOpcode::G_ASHR: case TargetOpcode::G_GEP: { // Reject the various things we don't support yet. if (unsupportedBinOp(I, RBI, MRI, TRI)) return false; const unsigned OpSize = Ty.getSizeInBits(); const unsigned DefReg = I.getOperand(0).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize); if (NewOpc == I.getOpcode()) return false; I.setDesc(TII.get(NewOpc)); // FIXME: Should the type be always reset in setDesc? // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. 
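    // e.g. "%hi:gpr(s64) = G_SMULH %a, %b" becomes "SMULHrr %hi, %a, %b",
    // producing the high 64 bits of the 128-bit signed product.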
return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_PTR_MASK: { uint64_t Align = I.getOperand(2).getImm(); if (Align >= 64 || Align == 0) return false; uint64_t Mask = ~((1ULL << Align) - 1); I.setDesc(TII.get(AArch64::ANDXri)); I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64)); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } case TargetOpcode::G_PTRTOINT: case TargetOpcode::G_TRUNC: { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()); const LLT SrcTy = MRI.getType(I.getOperand(1).getReg()); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); if (DstRB.getID() != SrcRB.getID()) { DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n"); return false; } if (DstRB.getID() == AArch64::GPRRegBankID) { const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB, RBI); if (!DstRC) return false; const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB, RBI); if (!SrcRC) return false; if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n"); return false; } if (DstRC == SrcRC) { // Nothing to be done } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) && SrcTy == LLT::scalar(64)) { llvm_unreachable("TableGen can import this case"); return false; } else if (DstRC == &AArch64::GPR32RegClass && SrcRC == &AArch64::GPR64RegClass) { I.getOperand(1).setSubReg(AArch64::sub_32); } else { DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n"); return false; } I.setDesc(TII.get(TargetOpcode::COPY)); return true; } else if (DstRB.getID() == AArch64::FPRRegBankID) { if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) { I.setDesc(TII.get(AArch64::XTNv4i16)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } } return false; } case TargetOpcode::G_ANYEXT: { const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI); if (RBDst.getID() != AArch64::GPRRegBankID) { DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst << ", expected: GPR\n"); return false; } const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI); if (RBSrc.getID() != AArch64::GPRRegBankID) { DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc << ", expected: GPR\n"); return false; } const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); if (DstSize == 0) { DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n"); return false; } if (DstSize != 64 && DstSize > 32) { DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize << ", expected: 32 or 64\n"); return false; } // At this point G_ANYEXT is just like a plain COPY, but we need // to explicitly form the 64-bit value if any. 
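    // e.g. an s32 -> s64 G_ANYEXT builds
    //   %tmp:gpr64all = SUBREG_TO_REG 0, %src, sub_32
    // and lets the copy selected below move %tmp into the destination;
    // G_ANYEXT places no requirement on the upper 32 bits.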
if (DstSize > 32) { unsigned ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(ExtSrc) .addImm(0) .addUse(SrcReg) .addImm(AArch64::sub_32); I.getOperand(1).setReg(ExtSrc); } return selectCopy(I, TII, MRI, TRI, RBI); } case TargetOpcode::G_ZEXT: case TargetOpcode::G_SEXT: { unsigned Opcode = I.getOpcode(); const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), SrcTy = MRI.getType(I.getOperand(1).getReg()); const bool isSigned = Opcode == TargetOpcode::G_SEXT; const unsigned DefReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); if (RB.getID() != AArch64::GPRRegBankID) { DEBUG(dbgs() << TII.getName(I.getOpcode()) << " on bank: " << RB << ", expected: GPR\n"); return false; } MachineInstr *ExtI; if (DstTy == LLT::scalar(64)) { // FIXME: Can we avoid manually doing this? if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) { DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode) << " operand\n"); return false; } const unsigned SrcXReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG)) .addDef(SrcXReg) .addImm(0) .addUse(SrcReg) .addImm(AArch64::sub_32); const unsigned NewOpc = isSigned ? AArch64::SBFMXri : AArch64::UBFMXri; ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) .addDef(DefReg) .addUse(SrcXReg) .addImm(0) .addImm(SrcTy.getSizeInBits() - 1); } else if (DstTy.isScalar() && DstTy.getSizeInBits() <= 32) { const unsigned NewOpc = isSigned ? AArch64::SBFMWri : AArch64::UBFMWri; ExtI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc)) .addDef(DefReg) .addUse(SrcReg) .addImm(0) .addImm(SrcTy.getSizeInBits() - 1); } else { return false; } constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI); I.eraseFromParent(); return true; } case TargetOpcode::G_SITOFP: case TargetOpcode::G_UITOFP: case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: { const LLT DstTy = MRI.getType(I.getOperand(0).getReg()), SrcTy = MRI.getType(I.getOperand(1).getReg()); const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy); if (NewOpc == Opcode) return false; I.setDesc(TII.get(NewOpc)); constrainSelectedInstRegOperands(I, TII, TRI, RBI); return true; } case TargetOpcode::G_INTTOPTR: // The importer is currently unable to import pointer types since they // didn't exist in SelectionDAG. return selectCopy(I, TII, MRI, TRI, RBI); case TargetOpcode::G_BITCAST: // Imported SelectionDAG rules can handle every bitcast except those that // bitcast from a type to the same type. Ideally, these shouldn't occur // but we might not run an optimizer that deletes them. 
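  // e.g. an unoptimized "%d:gpr(s64) = G_BITCAST %s:gpr(s64)" degenerates
  // into a plain COPY here instead of failing selection.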
if (MRI.getType(I.getOperand(0).getReg()) == MRI.getType(I.getOperand(1).getReg())) return selectCopy(I, TII, MRI, TRI, RBI); return false; case TargetOpcode::G_SELECT: { if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) { DEBUG(dbgs() << "G_SELECT cond has type: " << Ty << ", expected: " << LLT::scalar(1) << '\n'); return false; } const unsigned CondReg = I.getOperand(1).getReg(); const unsigned TReg = I.getOperand(2).getReg(); const unsigned FReg = I.getOperand(3).getReg(); unsigned CSelOpc = 0; if (Ty == LLT::scalar(32)) { CSelOpc = AArch64::CSELWr; } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) { CSelOpc = AArch64::CSELXr; } else { return false; } MachineInstr &TstMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri)) .addDef(AArch64::WZR) .addUse(CondReg) .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc)) .addDef(I.getOperand(0).getReg()) .addUse(TReg) .addUse(FReg) .addImm(AArch64CC::NE); constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI); I.eraseFromParent(); return true; } case TargetOpcode::G_ICMP: { if (Ty != LLT::scalar(32)) { DEBUG(dbgs() << "G_ICMP result has type: " << Ty << ", expected: " << LLT::scalar(32) << '\n'); return false; } unsigned CmpOpc = 0; unsigned ZReg = 0; LLT CmpTy = MRI.getType(I.getOperand(2).getReg()); if (CmpTy == LLT::scalar(32)) { CmpOpc = AArch64::SUBSWrr; ZReg = AArch64::WZR; } else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) { CmpOpc = AArch64::SUBSXrr; ZReg = AArch64::XZR; } else { return false; } // CSINC increments the result by one when the condition code is false. // Therefore, we have to invert the predicate to get an increment by 1 when // the predicate is true. 
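    // Worked example: for "G_ICMP intpred(eq), %a, %b" we emit SUBS plus
    //   CSINCWr %dst, %wzr, %wzr, ne
    // CSINC produces wzr + 1 = 1 exactly when NE is *false*, i.e. when
    // %a == %b, and wzr = 0 otherwise.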
const AArch64CC::CondCode invCC = changeICMPPredToAArch64CC(CmpInst::getInversePredicate( (CmpInst::Predicate)I.getOperand(1).getPredicate())); MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) .addDef(ZReg) .addUse(I.getOperand(2).getReg()) .addUse(I.getOperand(3).getReg()); MachineInstr &CSetMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) .addDef(I.getOperand(0).getReg()) .addUse(AArch64::WZR) .addUse(AArch64::WZR) .addImm(invCC); constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); I.eraseFromParent(); return true; } case TargetOpcode::G_FCMP: { if (Ty != LLT::scalar(32)) { DEBUG(dbgs() << "G_FCMP result has type: " << Ty << ", expected: " << LLT::scalar(32) << '\n'); return false; } unsigned CmpOpc = 0; LLT CmpTy = MRI.getType(I.getOperand(2).getReg()); if (CmpTy == LLT::scalar(32)) { CmpOpc = AArch64::FCMPSrr; } else if (CmpTy == LLT::scalar(64)) { CmpOpc = AArch64::FCMPDrr; } else { return false; } // FIXME: regbank AArch64CC::CondCode CC1, CC2; changeFCMPPredToAArch64CC( (CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2); MachineInstr &CmpMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc)) .addUse(I.getOperand(2).getReg()) .addUse(I.getOperand(3).getReg()); const unsigned DefReg = I.getOperand(0).getReg(); unsigned Def1Reg = DefReg; if (CC2 != AArch64CC::AL) Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstr &CSetMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) .addDef(Def1Reg) .addUse(AArch64::WZR) .addUse(AArch64::WZR) .addImm(getInvertedCondCode(CC1)); if (CC2 != AArch64CC::AL) { unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstr &CSet2MI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr)) .addDef(Def2Reg) .addUse(AArch64::WZR) .addUse(AArch64::WZR) .addImm(getInvertedCondCode(CC2)); MachineInstr &OrMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr)) .addDef(DefReg) .addUse(Def1Reg) .addUse(Def2Reg); constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI); } constrainSelectedInstRegOperands(CmpMI, TII, TRI, RBI); constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI); I.eraseFromParent(); return true; } case TargetOpcode::G_VASTART: return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) : selectVaStartAAPCS(I, MF, MRI); case TargetOpcode::G_IMPLICIT_DEF: I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); return true; } return false; } /// SelectArithImmed - Select an immediate value that can be represented as /// a 12-bit value shifted left by either 0 or 12. If so, return true with /// Val set to the 12-bit value and Shift set to the shifter operand. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const { MachineInstr &MI = *Root.getParent(); MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); MachineRegisterInfo &MRI = MF.getRegInfo(); // This function is called from the addsub_shifted_imm ComplexPattern, // which lists [imm] as the list of opcode it's interested in, however // we still need to check whether the operand is actually an immediate // here because the ComplexPattern opcode list is only used in // root-level opcode matching. 
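  // e.g. 0x123 matches as (#0x123, LSL #0) and 0x123000 as (#0x123, LSL #12),
  // while 0x123456 fails both checks below and is rejected.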
uint64_t Immed; if (Root.isImm()) Immed = Root.getImm(); else if (Root.isCImm()) Immed = Root.getCImm()->getZExtValue(); else if (Root.isReg()) { MachineInstr *Def = MRI.getVRegDef(Root.getReg()); if (Def->getOpcode() != TargetOpcode::G_CONSTANT) return None; MachineOperand &Op1 = Def->getOperand(1); if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64) return None; Immed = Op1.getCImm()->getZExtValue(); } else return None; unsigned ShiftAmt; if (Immed >> 12 == 0) { ShiftAmt = 0; } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { ShiftAmt = 12; Immed = Immed >> 12; } else return None; unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); }, }}; } /// Select a "register plus unscaled signed 9-bit immediate" address. This /// should only match when there is an offset that is not valid for a scaled /// immediate addressing mode. The "Size" argument is the size in bytes of the /// memory reference, which is needed here to know what is valid for a scaled /// immediate. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root, unsigned Size) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); if (!Root.isReg()) return None; if (!isBaseWithConstantOffset(Root, MRI)) return None; MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); if (!RootDef) return None; MachineOperand &OffImm = RootDef->getOperand(2); if (!OffImm.isReg()) return None; MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg()); if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT) return None; int64_t RHSC; MachineOperand &RHSOp1 = RHS->getOperand(1); if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64) return None; RHSC = RHSOp1.getCImm()->getSExtValue(); // If the offset is valid as a scaled immediate, don't match here. if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size))) return None; if (RHSC >= -256 && RHSC < 256) { MachineOperand &Base = RootDef->getOperand(1); return {{ [=](MachineInstrBuilder &MIB) { MIB.add(Base); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); }, }}; } return None; } /// Select a "register plus scaled unsigned 12-bit immediate" address. The /// "Size" argument is the size in bytes of the memory reference, which /// determines the scale. 
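/// For example, with Size == 4 a byte offset of 8 is rendered as the scaled
/// immediate #2; offsets that are not multiples of 4, are negative, or
/// exceed 4095 * 4 are left to the unscaled form above.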
InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root, unsigned Size) const { MachineRegisterInfo &MRI = Root.getParent()->getParent()->getParent()->getRegInfo(); if (!Root.isReg()) return None; MachineInstr *RootDef = MRI.getVRegDef(Root.getReg()); if (!RootDef) return None; if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) { return {{ [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, }}; } if (isBaseWithConstantOffset(Root, MRI)) { MachineOperand &LHS = RootDef->getOperand(1); MachineOperand &RHS = RootDef->getOperand(2); MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg()); MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg()); if (LHSDef && RHSDef) { int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue(); unsigned Scale = Log2_32(Size); if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) return {{ [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, }}; return {{ [=](MachineInstrBuilder &MIB) { MIB.add(LHS); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); }, }}; } } } // Before falling back to our general case, check if the unscaled // instructions can handle this. If so, that's preferable. if (selectAddrModeUnscaled(Root, Size).hasValue()) return None; return {{ [=](MachineInstrBuilder &MIB) { MIB.add(Root); }, [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, }}; } namespace llvm { InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &TM, AArch64Subtarget &Subtarget, AArch64RegisterBankInfo &RBI) { return new AArch64InstructionSelector(TM, Subtarget, RBI); } } Index: head/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp (revision 328752) +++ head/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp (revision 328753) @@ -1,252 +1,255 @@ //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the AArch64 specific subclass of TargetSubtarget. // //===----------------------------------------------------------------------===// #include "AArch64Subtarget.h" #include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64PBQPRegAlloc.h" #include "AArch64TargetMachine.h" #include "AArch64CallLowering.h" #include "AArch64LegalizerInfo.h" #include "AArch64RegisterBankInfo.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" using namespace llvm; #define DEBUG_TYPE "aarch64-subtarget" #define GET_SUBTARGETINFO_CTOR #define GET_SUBTARGETINFO_TARGET_DESC #include "AArch64GenSubtargetInfo.inc" static cl::opt EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); // If OS supports TBI, use this flag to enable it. 
static cl::opt UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " "an address is ignored"), cl::init(false), cl::Hidden); static cl::opt UseNonLazyBind("aarch64-enable-nonlazybind", cl::desc("Call nonlazybind functions via direct GOT load"), cl::init(false), cl::Hidden); AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, StringRef CPUString) { // Determine default and user-specified characteristics if (CPUString.empty()) CPUString = "generic"; ParseSubtargetFeatures(CPUString, FS); initializeProperties(); return *this; } void AArch64Subtarget::initializeProperties() { // Initialize CPU specific properties. We should add a tablegen feature for // this in the future so we can specify it together with the subtarget // features. switch (ARMProcFamily) { case Cyclone: CacheLineSize = 64; PrefetchDistance = 280; MinPrefetchStride = 2048; MaxPrefetchIterationsAhead = 3; break; case CortexA57: MaxInterleaveFactor = 4; PrefFunctionAlignment = 4; break; case ExynosM1: MaxInterleaveFactor = 4; MaxJumpTableSize = 8; PrefFunctionAlignment = 4; PrefLoopAlignment = 3; break; case Falkor: MaxInterleaveFactor = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; CacheLineSize = 128; PrefetchDistance = 820; MinPrefetchStride = 2048; MaxPrefetchIterationsAhead = 8; break; case Saphira: MaxInterleaveFactor = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case Kryo: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; CacheLineSize = 128; PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case ThunderX2T99: CacheLineSize = 64; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; MaxInterleaveFactor = 4; PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case ThunderX: case ThunderXT88: case ThunderXT81: case ThunderXT83: CacheLineSize = 128; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: PrefFunctionAlignment = 3; break; case CortexA55: break; case CortexA72: case CortexA73: case CortexA75: PrefFunctionAlignment = 4; break; case Others: break; } } AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this) { CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); Legalizer.reset(new AArch64LegalizerInfo(*this)); auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); // FIXME: At this point, we can't rely on Subtarget having RBI. // It's awkward to mix passing RBI and the Subtarget; should we pass // TII/TRI as well? 
InstSelector.reset(createAArch64InstructionSelector( *static_cast(&TM), *this, *RBI)); RegBankInfo.reset(RBI); } const CallLowering *AArch64Subtarget::getCallLowering() const { return CallLoweringInfo.get(); } const InstructionSelector *AArch64Subtarget::getInstructionSelector() const { return InstSelector.get(); } const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const { return Legalizer.get(); } const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { return RegBankInfo.get(); } /// Find the target operand flags that describe how a global value should be /// referenced for the current subtarget. unsigned char AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte // absolute relocation on all global addresses. if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) return AArch64II::MO_GOT; + unsigned Flags = GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT + : AArch64II::MO_NO_FLAG; + if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) - return AArch64II::MO_GOT; + return AArch64II::MO_GOT | Flags; // The small code model's direct accesses use ADRP, which cannot // necessarily produce the value 0 (if the code is above 4GB). if (useSmallAddressing() && GV->hasExternalWeakLinkage()) - return AArch64II::MO_GOT; + return AArch64II::MO_GOT | Flags; - return AArch64II::MO_NO_FLAG; + return Flags; } unsigned char AArch64Subtarget::classifyGlobalFunctionReference( const GlobalValue *GV, const TargetMachine &TM) const { // MachO large model always goes via a GOT, because we don't have the // relocations available to do anything else.. if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && !GV->hasInternalLinkage()) return AArch64II::MO_GOT; // NonLazyBind goes via GOT unless we know it's available locally. auto *F = dyn_cast(GV); if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; Policy.OnlyBottomUp = false; // Enabling or Disabling the latency heuristic is a close call: It seems to // help nearly no benchmark on out-of-order architectures, on the other hand // it regresses register pressure on a few benchmarking. Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; } bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } bool AArch64Subtarget::supportsAddressTopByteIgnored() const { if (!UseAddressTopByteIgnored) return false; if (TargetTriple.isiOS()) { unsigned Major, Minor, Micro; TargetTriple.getiOSVersion(Major, Minor, Micro); return Major >= 8; } return false; } std::unique_ptr AArch64Subtarget::getCustomPBQPConstraints() const { return balanceFPOps() ? 
llvm::make_unique<A57ChainingConstraint>() : nullptr; } Index: head/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp =================================================================== --- head/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp (revision 328752) +++ head/contrib/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp (revision 328753) @@ -1,436 +1,444 @@ //===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file /// \brief This pass inserts branches on the 0 exec mask over divergent /// branches when it's expected that jumping over the untaken control flow will /// be cheaper than having every workitem no-op through it. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" #include <cassert> #include <cstdint> #include <iterator> using namespace llvm; #define DEBUG_TYPE "si-insert-skips" static cl::opt<unsigned> SkipThresholdFlag( "amdgpu-skip-threshold", cl::desc("Number of instructions before jumping over divergent control flow"), cl::init(12), cl::Hidden); namespace { class SIInsertSkips : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; unsigned SkipThreshold = 0; bool shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); void kill(MachineInstr &MI); MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); public: static char ID; SIInsertSkips() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return "SI insert s_cbranch_execz instructions"; } void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); } }; } // end anonymous namespace char SIInsertSkips::ID = 0; INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; static bool opcodeEmitsNoInsts(unsigned Opc) { switch (Opc) { case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::BUNDLE: case TargetOpcode::CFI_INSTRUCTION: case TargetOpcode::EH_LABEL: case TargetOpcode::GC_LABEL: case TargetOpcode::DBG_VALUE: return true; default: return false; } } bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From, const MachineBasicBlock &To) const { if (From.succ_empty()) return false; unsigned NumInstr = 0; const MachineFunction *MF = From.getParent(); for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); MBBI != End && MBBI != ToI; ++MBBI) { const MachineBasicBlock &MBB =
*MBBI; for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); NumInstr < SkipThreshold && I != E; ++I) { if (opcodeEmitsNoInsts(I->getOpcode())) continue; // FIXME: Since this is required for correctness, this should be inserted // during SILowerControlFlow. // When a uniform loop is inside non-uniform control flow, the branch // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken // when EXEC = 0. We should skip the loop lest it becomes infinite. if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) return true; // V_READFIRSTLANE/V_READLANE destination register may be used as operand // by some SALU instruction. If exec mask is zero vector instruction // defining the register that is used by the scalar one is not executed // and scalar instruction will operate on undefined data. For // V_READFIRSTLANE/V_READLANE we should avoid predicated execution. if ((I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) || (I->getOpcode() == AMDGPU::V_READLANE_B32)) { return true; } if (I->isInlineAsm()) { const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); const char *AsmStr = I->getOperand(0).getSymbolName(); // inlineasm length estimate is number of bytes assuming the longest // instruction. uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); NumInstr += MaxAsmSize / MAI->getMaxInstLength(); } else { ++NumInstr; } if (NumInstr >= SkipThreshold) return true; } } return false; } bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { MachineBasicBlock &MBB = *MI.getParent(); MachineFunction *MF = MBB.getParent(); if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(MBB, MBB.getParent()->back())) return false; MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) .addMBB(&NextBB); MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE)) .addImm(0x09) // V_008DFC_SQ_EXP_NULL .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addReg(AMDGPU::VGPR0, RegState::Undef) .addImm(1) // vm .addImm(0) // compr .addImm(0); // en // ... and terminate wavefront. BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); return true; } void SIInsertSkips::kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); switch (MI.getOpcode()) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: { unsigned Opcode = 0; // The opcodes are inverted because the inline immediate has to be // the first operand, e.g. 
from "x < imm" to "imm > x" switch (MI.getOperand(2).getImm()) { case ISD::SETOEQ: case ISD::SETEQ: - Opcode = AMDGPU::V_CMPX_EQ_F32_e32; + Opcode = AMDGPU::V_CMPX_EQ_F32_e64; break; case ISD::SETOGT: case ISD::SETGT: - Opcode = AMDGPU::V_CMPX_LT_F32_e32; + Opcode = AMDGPU::V_CMPX_LT_F32_e64; break; case ISD::SETOGE: case ISD::SETGE: - Opcode = AMDGPU::V_CMPX_LE_F32_e32; + Opcode = AMDGPU::V_CMPX_LE_F32_e64; break; case ISD::SETOLT: case ISD::SETLT: - Opcode = AMDGPU::V_CMPX_GT_F32_e32; + Opcode = AMDGPU::V_CMPX_GT_F32_e64; break; case ISD::SETOLE: case ISD::SETLE: - Opcode = AMDGPU::V_CMPX_GE_F32_e32; + Opcode = AMDGPU::V_CMPX_GE_F32_e64; break; case ISD::SETONE: case ISD::SETNE: - Opcode = AMDGPU::V_CMPX_LG_F32_e32; + Opcode = AMDGPU::V_CMPX_LG_F32_e64; break; case ISD::SETO: - Opcode = AMDGPU::V_CMPX_O_F32_e32; + Opcode = AMDGPU::V_CMPX_O_F32_e64; break; case ISD::SETUO: - Opcode = AMDGPU::V_CMPX_U_F32_e32; + Opcode = AMDGPU::V_CMPX_U_F32_e64; break; case ISD::SETUEQ: - Opcode = AMDGPU::V_CMPX_NLG_F32_e32; + Opcode = AMDGPU::V_CMPX_NLG_F32_e64; break; case ISD::SETUGT: - Opcode = AMDGPU::V_CMPX_NGE_F32_e32; + Opcode = AMDGPU::V_CMPX_NGE_F32_e64; break; case ISD::SETUGE: - Opcode = AMDGPU::V_CMPX_NGT_F32_e32; + Opcode = AMDGPU::V_CMPX_NGT_F32_e64; break; case ISD::SETULT: - Opcode = AMDGPU::V_CMPX_NLE_F32_e32; + Opcode = AMDGPU::V_CMPX_NLE_F32_e64; break; case ISD::SETULE: - Opcode = AMDGPU::V_CMPX_NLT_F32_e32; + Opcode = AMDGPU::V_CMPX_NLT_F32_e64; break; case ISD::SETUNE: - Opcode = AMDGPU::V_CMPX_NEQ_F32_e32; + Opcode = AMDGPU::V_CMPX_NEQ_F32_e64; break; default: llvm_unreachable("invalid ISD:SET cond code"); } - // TODO: Allow this: - if (!MI.getOperand(0).isReg() || - !TRI->isVGPR(MBB.getParent()->getRegInfo(), - MI.getOperand(0).getReg())) - llvm_unreachable("SI_KILL operand should be a VGPR"); + assert(MI.getOperand(0).isReg()); - BuildMI(MBB, &MI, DL, TII->get(Opcode)) - .add(MI.getOperand(1)) - .add(MI.getOperand(0)); + if (TRI->isVGPR(MBB.getParent()->getRegInfo(), + MI.getOperand(0).getReg())) { + Opcode = AMDGPU::getVOPe32(Opcode); + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .add(MI.getOperand(1)) + .add(MI.getOperand(0)); + } else { + BuildMI(MBB, &MI, DL, TII->get(Opcode)) + .addReg(AMDGPU::VCC, RegState::Define) + .addImm(0) // src0 modifiers + .add(MI.getOperand(1)) + .addImm(0) // src1 modifiers + .add(MI.getOperand(0)) + .addImm(0); // omod + } break; } case AMDGPU::SI_KILL_I1_TERMINATOR: { const MachineOperand &Op = MI.getOperand(0); int64_t KillVal = MI.getOperand(1).getImm(); assert(KillVal == 0 || KillVal == -1); // Kill all threads if Op0 is an immediate and equal to the Kill value. if (Op.isImm()) { int64_t Imm = Op.getImm(); assert(Imm == 0 || Imm == -1); if (Imm == KillVal) BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) .addImm(0); break; } unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64; BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC) .addReg(AMDGPU::EXEC) .add(Op); break; } default: llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR"); } } MachineBasicBlock *SIInsertSkips::insertSkipBlock( MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { MachineFunction *MF = MBB.getParent(); MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); MachineFunction::iterator MBBI(MBB); ++MBBI; MF->insert(MBBI, SkipBB); MBB.addSuccessor(SkipBB); return SkipBB; } // Returns true if a branch over the block was inserted. 
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { MachineBasicBlock *DestBB = MI.getOperand(0).getMBB(); if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB)) return false; const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator InsPt = std::next(MI.getIterator()); BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) .addMBB(DestBB); return true; } bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { const SISubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); SkipThreshold = SkipThresholdFlag; bool HaveKill = false; bool MadeChange = false; // Track depth of exec mask, divergent branches. SmallVector ExecBranchStack; MachineFunction::iterator NextBB; MachineBasicBlock *EmptyMBBAtEnd = nullptr; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; BI = NextBB) { NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; bool HaveSkipBlock = false; if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) { // Reached convergence point for last divergent branch. ExecBranchStack.pop_back(); } if (HaveKill && ExecBranchStack.empty()) { HaveKill = false; // TODO: Insert skip if exec is 0? } MachineBasicBlock::iterator I, Next; for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; switch (MI.getOpcode()) { case AMDGPU::SI_MASK_BRANCH: ExecBranchStack.push_back(MI.getOperand(0).getMBB()); MadeChange |= skipMaskBranch(MI, MBB); break; case AMDGPU::S_BRANCH: // Optimize out branches to the next block. // FIXME: Shouldn't this be handled by BranchFolding? if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { MI.eraseFromParent(); } else if (HaveSkipBlock) { // Remove the given unconditional branch when a skip block has been // inserted after the current one and let skip the two instructions // performing the kill if the exec mask is non-zero. MI.eraseFromParent(); } break; case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: MadeChange = true; kill(MI); if (ExecBranchStack.empty()) { if (skipIfDead(MI, *NextBB)) { HaveSkipBlock = true; NextBB = std::next(BI); BE = MF.end(); } } else { HaveKill = true; } MI.eraseFromParent(); break; case AMDGPU::SI_RETURN_TO_EPILOG: // FIXME: Should move somewhere else assert(!MF.getInfo()->returnsVoid()); // Graphics shaders returning non-void shouldn't contain S_ENDPGM, // because external bytecode will be appended at the end. if (BI != --MF.end() || I != MBB.getFirstTerminator()) { // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at // the end and jump there. if (!EmptyMBBAtEnd) { EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); MF.insert(MF.end(), EmptyMBBAtEnd); } MBB.addSuccessor(EmptyMBBAtEnd); BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) .addMBB(EmptyMBBAtEnd); I->eraseFromParent(); } break; default: break; } } } return MadeChange; } Index: head/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp =================================================================== --- head/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp (revision 328752) +++ head/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp (revision 328753) @@ -1,269 +1,269 @@ //===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
//
//===----------------------------------------------------------------------===//
//
// This file includes code for rendering MCInst instances as Intel-style
// assembly.
//
//===----------------------------------------------------------------------===//

#include "X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstComments.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>

using namespace llvm;

#define DEBUG_TYPE "asm-printer"

#include "X86GenAsmWriter1.inc"

void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
  OS << getRegisterName(RegNo);
}

void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
                                    StringRef Annot,
                                    const MCSubtargetInfo &STI) {
  const MCInstrDesc &Desc = MII.get(MI->getOpcode());
  uint64_t TSFlags = Desc.TSFlags;
+  unsigned Flags = MI->getFlags();

-  if (TSFlags & X86II::LOCK)
+  if ((TSFlags & X86II::LOCK) || (Flags & X86::IP_HAS_LOCK))
    OS << "\tlock\t";

-  unsigned Flags = MI->getFlags();
  if (Flags & X86::IP_HAS_REPEAT_NE)
    OS << "\trepne\t";
  else if (Flags & X86::IP_HAS_REPEAT)
    OS << "\trep\t";

  printInstruction(MI, OS);

  // Next always print the annotation.
  printAnnotation(OS, Annot);

  // If verbose assembly is enabled, we can print some informative comments.
  if (CommentStream)
    EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
}

void X86IntelInstPrinter::printSSEAVXCC(const MCInst *MI, unsigned Op,
                                        raw_ostream &O) {
  int64_t Imm = MI->getOperand(Op).getImm();
  switch (Imm) {
  default: llvm_unreachable("Invalid avxcc argument!");
  case    0: O << "eq"; break;
  case    1: O << "lt"; break;
  case    2: O << "le"; break;
  case    3: O << "unord"; break;
  case    4: O << "neq"; break;
  case    5: O << "nlt"; break;
  case    6: O << "nle"; break;
  case    7: O << "ord"; break;
  case    8: O << "eq_uq"; break;
  case    9: O << "nge"; break;
  case  0xa: O << "ngt"; break;
  case  0xb: O << "false"; break;
  case  0xc: O << "neq_oq"; break;
  case  0xd: O << "ge"; break;
  case  0xe: O << "gt"; break;
  case  0xf: O << "true"; break;
  case 0x10: O << "eq_os"; break;
  case 0x11: O << "lt_oq"; break;
  case 0x12: O << "le_oq"; break;
  case 0x13: O << "unord_s"; break;
  case 0x14: O << "neq_us"; break;
  case 0x15: O << "nlt_uq"; break;
  case 0x16: O << "nle_uq"; break;
  case 0x17: O << "ord_s"; break;
  case 0x18: O << "eq_us"; break;
  case 0x19: O << "nge_uq"; break;
  case 0x1a: O << "ngt_uq"; break;
  case 0x1b: O << "false_os"; break;
  case 0x1c: O << "neq_os"; break;
  case 0x1d: O << "ge_oq"; break;
  case 0x1e: O << "gt_oq"; break;
  case 0x1f: O << "true_us"; break;
  }
}

void X86IntelInstPrinter::printXOPCC(const MCInst *MI, unsigned Op,
                                     raw_ostream &O) {
  int64_t Imm = MI->getOperand(Op).getImm();
  switch (Imm) {
  default: llvm_unreachable("Invalid xopcc argument!");
  case 0: O << "lt"; break;
  case 1: O << "le"; break;
  case 2: O << "gt"; break;
  case 3: O << "ge"; break;
  case 4: O << "eq"; break;
  case 5: O << "neq"; break;
  case 6: O << "false"; break;
  case 7: O << "true"; break;
  }
}

void X86IntelInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
                                               raw_ostream &O) {
  int64_t Imm = MI->getOperand(Op).getImm() & 0x3;
  switch (Imm) {
  case 0: O << "{rn-sae}"; break;
  case 1: O << "{rd-sae}"; break;
  case 2: O << "{ru-sae}"; break;
  case 3: O << "{rz-sae}"; break;
  }
}

/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value.
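// For example (illustrative operands, not captured printer output): a
// branch whose target folded to an MCConstantExpr prints as an absolute
// hex address, anything else prints symbolically:
//
//   call 0x402000       ; constant branch target, printed via formatHex
//   call foo            ; unresolved MCExpr, printed as the expression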
void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isImm()) O << formatImm(Op.getImm()); else { assert(Op.isExpr() && "unknown pcrel immediate operand"); // If a symbolic branch target was added as a constant expression then print // that address in hex. const MCConstantExpr *BranchTarget = dyn_cast(Op.getExpr()); int64_t Address; if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) { O << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. Op.getExpr()->print(O, &MAI); } } } void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { printRegName(O, Op.getReg()); } else if (Op.isImm()) { O << formatImm((int64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); O << "offset "; Op.getExpr()->print(O, &MAI); } } void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg); unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm(); const MCOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg); const MCOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp); const MCOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg); // If this has a segment register, print it. if (SegReg.getReg()) { printOperand(MI, Op+X86::AddrSegmentReg, O); O << ':'; } O << '['; bool NeedPlus = false; if (BaseReg.getReg()) { printOperand(MI, Op+X86::AddrBaseReg, O); NeedPlus = true; } if (IndexReg.getReg()) { if (NeedPlus) O << " + "; if (ScaleVal != 1) O << ScaleVal << '*'; printOperand(MI, Op+X86::AddrIndexReg, O); NeedPlus = true; } if (!DispSpec.isImm()) { if (NeedPlus) O << " + "; assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); DispSpec.getExpr()->print(O, &MAI); } else { int64_t DispVal = DispSpec.getImm(); if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) { if (NeedPlus) { if (DispVal > 0) O << " + "; else { O << " - "; DispVal = -DispVal; } } O << formatImm(DispVal); } } O << ']'; } void X86IntelInstPrinter::printSrcIdx(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &SegReg = MI->getOperand(Op+1); // If this has a segment register, print it. if (SegReg.getReg()) { printOperand(MI, Op+1, O); O << ':'; } O << '['; printOperand(MI, Op, O); O << ']'; } void X86IntelInstPrinter::printDstIdx(const MCInst *MI, unsigned Op, raw_ostream &O) { // DI accesses are always ES-based. O << "es:["; printOperand(MI, Op, O); O << ']'; } void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &DispSpec = MI->getOperand(Op); const MCOperand &SegReg = MI->getOperand(Op+1); // If this has a segment register, print it. 
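  // As in printMemReference above, the segment register becomes a "seg:"
  // prefix, so the fully general Intel operand renders roughly as
  //   fs:[esi + 4*ebx + 16]   ; segment, base, scale*index, displacement
  // with each component omitted when absent (illustrative operand).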
if (SegReg.getReg()) { printOperand(MI, Op+1, O); O << ':'; } O << '['; if (DispSpec.isImm()) { O << formatImm(DispSpec.getImm()); } else { assert(DispSpec.isExpr() && "non-immediate displacement?"); DispSpec.getExpr()->print(O, &MAI); } O << ']'; } void X86IntelInstPrinter::printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O) { if (MI->getOperand(Op).isExpr()) return MI->getOperand(Op).getExpr()->print(O, &MAI); O << formatImm(MI->getOperand(Op).getImm() & 0xff); } Index: head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 328752) +++ head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 328753) @@ -1,38736 +1,38737 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86ShuffleDecodeConstantPool.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments " "(the last x86-experimental-pref-loop-alignment bits" " of the loop header PC will be 0)."), cl::Hidden); static cl::opt 
MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// crashing. static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget.isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget.isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. 
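  // (A truncstore narrows and stores in one node, e.g. an i32 value stored
  // through an i16 pointer. Marking these Expand makes the legalizer
  // rewrite each one as an explicit truncate plus an ordinary narrow
  // store, which then matches the usual mov patterns. In IR terms:
  //   %t = trunc i32 %v to i16
  //   store i16 %t, i16* %p
  // -- a sketch, not code from this file.)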
setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // SSE has no i16 to fp conversion, only i32. if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. 
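  // (Concretely, with SSE2 in 64-bit mode "FP_TO_SINT f64 -> i64" maps
  // straight onto cvttsd2si with a 64-bit destination:
  //   cvttsd2si rax, xmm0
  // A 32-bit target has no such destination register, hence Custom there.)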
setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget.useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. 
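  // (Example of the payoff: for
  //    q = x / y;  r = x % y;
  // both operations CSE into one ISD::SDIVREM node, matching the single
  // idiv instruction that already yields quotient and remainder together.)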
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. 
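  // (So a half <-> double conversion is always split through f32: the f16
  // leg is done at f32 width and then extended/rounded, e.g. without F16C
  // conceptually via the generic soft-float libcall
  //   float f = __gnu_h2f_ieee(h);   // f16 -> f32, default libcall name
  //   double d = (double)f;
  // -- the route and name here are the target-independent defaults, shown
  // only as an assumption-level sketch.)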
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. 
for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. 
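    // (copysign has no x86 instruction; with SSE it is pure bit math,
    // roughly:
    //   result = (x & ~SIGNMASK) | (y & SIGNMASK)  // SIGNMASK = 0x80..0
    // i.e. one ANDPD per side plus an ORPD, as the comment above says.)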
setOperationAction(ISD::FCOPYSIGN, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. 
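  // (The f80 constants registered below are exactly the ones reachable
  // from the x87 stack-top loads without touching memory:
  //   fld1      ; push +1.0
  //   fchs      ; negate st(0) -> -1.0
  // which is why only +/-0.0 and +/-1.0 are treated as legal immediates.)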
if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. 
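  // (This is the deny-by-default half of vector legalization: after the
  // loop below, e.g. ISD::SDIV on v4i32 is Expand and would be scalarized,
  // conceptually
  //   for (i = 0; i < 4; ++i) r[i] = a[i] / b[i];   // four scalar idivs
  // unless a later feature-gated block overrides the action.)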
for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v16i8, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. 
for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationPromotedToType(ISD::AND, VT, MVT::v2i64); setOperationPromotedToType(ISD::OR, VT, MVT::v2i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. 
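    // (Before AVX2 there are no per-lane variable vector shifts; the
    // Custom hook can still emit one instruction when the amount is
    // uniform, e.g.
    //   pslld xmm0, xmm1        ; every lane shifted by the same amount
    // and decomposes genuinely per-lane amounts into cheaper sequences.)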
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ROTL, VT, Custom); // XOP can efficiently perform BITREVERSE with VPPERM. 
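    // (One of VPPERM's per-byte source transforms reverses the bits of the
    // selected byte, so a vector BITREVERSE reduces to a single VPPERM
    // with a constant selector, plus a byte swap for multi-byte elements.)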
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::FMA, VT, Legal); } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? 
Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); } if (HasInt256) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } if (HasInt256) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationPromotedToType(ISD::AND, VT, MVT::v4i64); setOperationPromotedToType(ISD::OR, VT, MVT::v4i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64); setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64); } if (HasInt256) { // Custom legalize 2x32 to get a little better code. 
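      // (Roughly: a v2i32/v2f32 gather has no native form, so the Custom
      // hook widens it to the 4-element VPGATHERDD/VGATHERDPS pattern with
      // the upper lanes masked off, rather than letting the legalizer
      // scalarize it into two separate loads -- an assumption-level sketch.)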
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); if (Subtarget.hasVLX()) { setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); } // Extends of v16i1/v8i1 to 128-bit vectors. 
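    // (e.g. sign-extending a v16i1 mask to v16i8 must produce 0 / -1
    // lanes; without the BWI vpmovm2b instruction this is lowered Custom
    // as a select between an all-ones and an all-zeros vector.)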
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom); for (auto VT : { MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16, MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32, MVT::v8i64, MVT::v32i16, MVT::v64i8}) { MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom); setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom); setTruncStoreAction(VT, MaskVT, Custom); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (!Subtarget.hasVLX()) { // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. 
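// For reference, the scalar semantics of the ISD::MSTORE nodes being widened
// here: lanes whose mask bit is clear leave memory untouched (no store, no
// fault), which is what the k-masked AVX-512 moves provide. A small
// self-contained model (illustrative only):
#include <cstdint>
#include <iostream>

void maskedStore(int32_t *Mem, const int32_t *Vec, uint8_t Mask, int Lanes) {
  for (int I = 0; I < Lanes; ++I)
    if ((Mask >> I) & 1) // only lanes selected by the mask are written
      Mem[I] = Vec[I];
}

int main() {
  int32_t Mem[4] = {0, 0, 0, 0};
  int32_t Vec[4] = {10, 20, 30, 40};
  maskedStore(Mem, Vec, 0b0101, 4);
  for (int32_t V : Mem)
    std::cout << V << " "; // prints: 10 0 30 0
  std::cout << "\n";
}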
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); } setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom); // Without BWI we need to use custom lowering to handle MVT::v64i8 input. setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64); setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
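// The per-lane operation that hasCDI() makes Legal below (VPLZCNTD/VPLZCNTQ)
// is count-leading-zeros, with CTLZ(0) defined as the full element width.
// A standalone scalar model of one lane (illustrative, not LLVM code):
#include <cstdint>
#include <iostream>

uint32_t ctlz32(uint32_t X) {
  if (X == 0)
    return 32; // matches the instruction's defined zero-input result
  uint32_t N = 0;
  for (uint32_t Bit = 1u << 31; (X & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

int main() {
  std::cout << ctlz32(1) << " " << ctlz32(0x80000000u) << " " << ctlz32(0)
            << "\n"; // prints: 31 0 32
}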
for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 if (!Subtarget.useSoftFloat() && (Subtarget.hasAVX512() || Subtarget.hasVLX())) { // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SINT_TO_FP, VT, Legal); setOperationAction(ISD::UINT_TO_FP, VT, Legal); setOperationAction(ISD::FP_TO_SINT, VT, Legal); setOperationAction(ISD::FP_TO_UINT, VT, Legal); } } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, 
Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); // Extends from v64i1 masks to 512-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationPromotedToType(ISD::AND, VT, MVT::v8i64); setOperationPromotedToType(ISD::OR, VT, MVT::v8i64); setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); } for (auto ExtType 
: {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() && (Subtarget.hasAVX512() || Subtarget.hasVLX())) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); for (auto VT : { MVT::v2i1, MVT::v4i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } // TODO: v8i1 concat should be legal without VLX to support concats of // v1i1, but we won't legalize it correctly currently without introducing // a v4i1 concat in the middle. setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); for (auto VT : { MVT::v2i1, MVT::v4i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v2i1/v4i1 masks to 128-bit vectors. setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into _sincos_stret if it is available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. 
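// Concretely, the Promote action arranged below computes fmodf the way the
// MSVC CRT headers do: widen to f64, call fmod, truncate back. A standalone
// sketch of that rewrite (plain C++):
#include <cmath>
#include <iostream>

float fmodfViaF64(float X, float Y) {
  // What "Promote" means for a libm call with no f32 variant on the target:
  // evaluate at the wider type and round the result back down.
  return static_cast<float>(
      std::fmod(static_cast<double>(X), static_cast<double>(Y)));
}

int main() {
  std::cout << fmodfViaF64(7.5f, 2.0f) << "\n"; // prints: 1.5
}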
if (Subtarget.is32Bit() &&
    (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
  for (ISD::NodeType Op :
       {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
        ISD::FLOG10, ISD::FPOW, ISD::FSIN})
    if (isOperationExpand(Op, MVT::f32))
      setOperationAction(Op, MVT::f32, Promote);

// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);

computeRegisterProperties(Subtarget.getRegisterInfo());

MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;

// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;

// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.

verifyIntrinsicTables();
}

// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}

bool X86TargetLowering::useStackGuardXorFP() const {
  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
  return Subtarget.getTargetTriple().isOSMSVCRT();
}
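// The XOR-with-frame-pointer scheme that emitStackGuardXorFP (below) emits
// as XOR64_FP/XOR32_FP, shown in scalar form: mixing the frame address into
// the cookie makes a leaked cookie value frame-specific. Standalone sketch;
// the cookie constant is an arbitrary example value.
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Cookie = 0x2B992DDFA232ull; // example value only
  uint64_t FP = reinterpret_cast<uint64_t>(__builtin_frame_address(0));
  uint64_t Stored = Cookie ^ FP; // what the prologue keeps in the frame
  uint64_t Check = Stored ^ FP;  // what the epilogue recomputes
  std::cout << (Check == Cookie) << "\n"; // prints: 1
}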
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  EVT PtrTy = getPointerTy(DAG.getDataLayout());
  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
  return SDValue(Node, 0);
}

TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (Subtarget.hasAVX512()) {
    const unsigned NumElts = VT.getVectorNumElements();

    // Figure out what this type will be legalized to.
    EVT LegalVT = VT;
    while (getTypeAction(Context, LegalVT) != TypeLegal)
      LegalVT = getTypeToTransformTo(Context, LegalVT);

    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
    if (LegalVT.getSimpleVT().is512BitVector())
      return EVT::getVectorVT(Context, MVT::i1, NumElts);

    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
      // If we legalized to less than a 512-bit vector, then we will use a vXi1
      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
      // vXi16/vXi8.
      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
    }
  }

  return VT.changeVectorElementTypeToInteger();
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination's alignment can satisfy
/// any constraint. Similarly, if SrcAlign is zero there is no need to check
/// it against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function &F = MF.getFunction(); if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX()) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, // getMemsetStores() may create an intermediate splat (using an integer // multiply) before we splat as a vector. return MVT::v32i8; } if (Subtarget.hasSSE2()) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? if (Subtarget.hasSSE1()) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. // The gymnastics of splatting a byte value into an XMM register and then // only using 8-byte stores (because this is a CPU with slow unaligned // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const { // Only relabel X86-32 for C / Stdcall CCs. 
  if (Subtarget.is64Bit())
    return;

  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;

  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction().getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as being passed in registers.
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF,
                                 Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}
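// getAddressSpace (below) relies on LLVM's x86 convention that address
// space 256 is %gs and 257 is %fs. A toy restatement of its policy
// (illustrative only): 64-bit user mode keeps TLS in %fs, while the kernel
// code model and 32-bit mode use %gs.
#include <iostream>

unsigned x86TLSAddressSpace(bool Is64Bit, bool KernelCodeModel) {
  if (Is64Bit)
    return KernelCodeModel ? 256 : 257;
  return 256;
}

int main() {
  std::cout << x86TLSAddressSpace(true, false) << "\n";  // prints: 257 (%fs)
  std::cout << x86TLSAddressSpace(false, false) << "\n"; // prints: 256 (%gs)
}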
unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}

Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    } else {
      // %fs:0x28, unless we're using a Kernel code model, in which case
      // it's %gs:0x28. %gs:0x14 on i386.
      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
      return SegmentOffset(IRB, Offset, getAddressSpace());
    }
  }

  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48; %gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
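// The addrspace(257) pointer built by SegmentOffset in getIRStackGuard
// (above) lowers to a plain %fs-relative load. The same glibc tcbhead_t
// slot read directly with inline assembly -- Linux/x86-64 with glibc only,
// purely illustrative:
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Guard;
  asm volatile("movq %%fs:0x28, %0" : "=r"(Guard)); // the stack-guard slot
  std::cout << "stack guard = 0x" << std::hex << Guard << "\n";
}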
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");
  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

/// Lowers mask values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
                       DAG.getIntPtrConstant(0, Dl));

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16  -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  } else
    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}

/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR
list. // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i32)); // Copy the result values into the output registers. for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // Add the register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (ValVT == MVT::f64 && (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. 
if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } SmallVector, 8> RegsToPass; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } // Add nodes to the DAG and add the values into the RetOps list for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. // For the case of sret and another return value, we have // Chain_0 at the function entry // Chain_1 = getCopyToReg(Chain_0) in the above loop // If we use Chain_1 in getCopyFromReg, we will have // Val = getCopyFromReg(Chain_1) // Chain_2 = getCopyToReg(Chain_1, Val) from below // getCopyToReg(Chain_0) will be glued together with // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be // in Unit B, and we will have cyclic dependency between Unit A and Unit B: // Data dependency from Unit B to Unit A due to usage of Val in // getCopyToReg(Chain_1, Val) // Chain dependency from Unit A to Unit B // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); // Add the returned register to the CalleeSaveDisableRegs list. 
if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT = MVT::i32; bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { // The ABI does not require i1, i8 or i16 to be extended. // // On Darwin, there is code in the wild relying on Clang's old behaviour of // always extending i8/i16 return values, so keep doing that for now. // (PR26665). ReturnMVT = MVT::i8; } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Reads two 32 bit registers and creates a 64 bit mask value. /// \param VA The current 32 bit value that need to be assigned. /// \param NextVA The next 32 bit value that need to be assigned. /// \param Root The parent DAG node. /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for /// glue purposes. In the case the DAG is already using /// physical register instead of virtual, we should glue /// our new SDValue to InFlag SDvalue. /// \return a new SDvalue of size 64bit. 
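// In scalar terms, the split done by Passv64i1ArgInRegs (above) and the
// reassembly done by getv64i1Argument (below) are just this halving and
// rejoining of a 64-bit mask (standalone sketch, not LLVM code):
#include <cstdint>
#include <iostream>

int main() {
  uint64_t Mask = 0xAAAAAAAA55555555ull;           // a v64i1 viewed as i64
  uint32_t Lo = static_cast<uint32_t>(Mask);       // EXTRACT_ELEMENT 0
  uint32_t Hi = static_cast<uint32_t>(Mask >> 32); // EXTRACT_ELEMENT 1
  // CONCAT_VECTORS of the two v32i1 halves undoes the split exactly.
  uint64_t Rebuilt = (static_cast<uint64_t>(Hi) << 32) | Lo;
  std::cout << std::hex << Lo << " " << Hi << " "
            << (Rebuilt == Mask) << "\n"; // prints: 55555555 aaaaaaaa 1
}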
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget, SDValue *InFlag = nullptr) { assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"); assert(NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"); SDValue Lo, Hi; unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = &X86::GR32RegClass; // Read a 32 bit value from the registers if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); } else { // When a physical register is available read the value from it and glue // the reads together. ArgValueLo = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueLo.getValue(2); ArgValueHi = DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueHi.getValue(2); } // Convert the i32 type into v32i1 type Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); // Convert the i32 type into v32i1 type Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); // Concatenate the two values together return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } /// The function will lower a register of various sizes (8/16/32/64) /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) /// \returns a DAG node contains the operand after lowering to mask type. static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { SDValue ValReturned = ValArg; if (ValVT == MVT::v1i1) return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); // In 64 bit machine, There is no need to truncate the value only bitcast } else { MVT maskLen; switch (ValVT.getSimpleVT().SimpleTy) { case MVT::v8i1: maskLen = MVT::i8; break; case MVT::v16i1: maskLen = MVT::i16; break; case MVT::v32i1: maskLen = MVT::i32; break; default: llvm_unreachable("Expecting a vector of i1 types"); } ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } return DAG.getBitcast(ValVT, ValReturned); } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. 
for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; ++I, ++InsIndex) { CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // In some calling conventions we need to remove the used registers // from the register mask. if (RegMask) { for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { if (!Subtarget.hasX87()) report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } SDValue Val; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Val = getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) .getValue(1); Val = Chain.getValue(0); InFlag = Chain.getValue(2); } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { if (VA.getValVT().isVector() && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); } else Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. 
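// A toy restatement of the classification used by callIsStructReturn (above)
// and argsAreStructReturn (below): only the first argument can carry the
// sret pointer, and inreg (or an MCU target) moves it into a register.
// Illustrative only.
#include <iostream>

enum ToyStructReturnType { ToyNotStructReturn, ToyRegStructReturn,
                           ToyStackStructReturn };

ToyStructReturnType classify(bool HasArgs, bool FirstIsSRet, bool FirstIsInReg,
                             bool IsMCU) {
  if (!HasArgs || !FirstIsSRet)
    return ToyNotStructReturn;
  return (FirstIsInReg || IsMCU) ? ToyRegStructReturn : ToyStackStructReturn;
}

int main() {
  std::cout << classify(true, true, false, false) << "\n"; // 2: via the stack
  std::cout << classify(true, true, true, false) << "\n";  // 1: in a register
}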
static StructReturnType argsAreStructReturn(const SmallVectorImpl &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same // absolute size. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // Calculate SP offset of interrupt parameter, re-arrange the slot normally // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. 
// Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); if (Subtarget.is64Bit() && Ins.size() == 2) { // The stack pointer needs to be realigned for 64 bit handlers with error // code, so the argument offset changes by 8 bytes. Offset += 8; } } // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. if (Flags.isCopyElisionCandidate()) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, // create a stack object for the entire argument value type and return a // load from our portion of it. This assumes that if the first part of an // argument is in memory, the rest will also be in memory. int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), /*Immutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { // This is not the first piece of an argument in memory. See if there is // already a fixed stack object including this offset. If so, assume it // was created by the PartOffset == 0 branch above and create a load from // the appropriate offset into it. int64_t PartBegin = VA.getLocMemOffset(); int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; int FI = MFI.getObjectIndexBegin(); for (; MFI.isFixedObjectIndex(FI); ++FI) { int64_t ObjBegin = MFI.getObjectOffset(FI); int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) break; } if (MFI.isFixedObjectIndex(FI)) { SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); return DAG.getLoad( ValVT, dl, Chain, Addr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, Ins[i].PartOffset)); } } } int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { MFI.setObjectSExt(FI, true); } // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) : Val; } // FIXME: Get this from tablegen. 
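// Editor's sketch (illustrative only; these arrays are not used by the
// lowering): the two 64-bit integer-argument register orders returned by
// get64BitArgumentGPRs below, written out as plain data. Win64 passes the
// first four integer arguments in RCX, RDX, R8, R9; the SysV AMD64 ABI uses
// six registers: RDI, RSI, RDX, RCX, R8, R9.
static const char *const DemoWin64GPROrder[] = {"rcx", "rdx", "r8", "r9"};
static const char *const DemoSysVGPROrder[] = {"rdi", "rsi", "rdx",
                                               "rcx", "r8",  "r9"};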
static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function &F = MF.getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } #ifndef NDEBUG static bool isSortedByValueNo(const SmallVectorImpl &ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } #endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || (!Is64Bit && Ins[1].VT == MVT::i32))); if (!isLegal) report_fatal_error("X86 interrupts may take one or two arguments"); } // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Ins, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); } // The next loop assumes that the locations are in the same order of the // input arguments. 
assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++InsIndex) { assert(InsIndex < Ins.size() && "Invalid Ins index"); CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { assert( VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else if (VA.getValVT().isVector() && VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. 
if (CallConv == CallingConv::Swift) continue; // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget.useSoftFloat() && F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. 
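    // Editor's worked example (illustrative): on SysV AMD64 the register save
    // area is 6 GPR slots of 8 bytes plus 8 XMM slots of 16 bytes,
    //   6*8 + 8*16 = 48 + 128 = 176 bytes,
    // and a function that already consumed 2 integer and 1 SSE argument
    // registers records gp_offset = 2*8 = 16 and fp_offset = 6*8 + 1*16 = 64,
    // matching the VarArgsGPOffset/VarArgsFPOffset values set above.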
SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code (and the alignment padding) if // present. FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. 
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (std::pair Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr( SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. 
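// Editor's sketch (standalone, illustrative; demoBuildMOVLMask is not part of
// the lowering): the shuffle mask produced by getMOVL below. In the
// concatenated-vector convention, indices >= NumElems select from V2, so the
// mask takes element 0 from V2 and the rest from V1; for 4 elements it is
// {4, 1, 2, 3}, i.e. the MOVSS/MOVSD pattern.
static void demoBuildMOVLMask(int *Mask, unsigned NumElems) {
  Mask[0] = (int)NumElems;  // element 0 comes from V2
  for (unsigned i = 1; i != NumElems; ++i)
    Mask[i] = (int)i;       // remaining elements come from V1
}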
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget.isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Outs, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. 
NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. 
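      // (Editor's note: an Indirect argument is passed by pointer, so the
      // value is spilled to a fresh stack slot below and the slot's address
      // is what actually gets passed.)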
SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; break; } } if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. 
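      // Editor's illustration (not part of the lowering): for a vararg call
      // such as printf("%f %f", a, b), two XMM registers carry arguments, so
      // any AL value in [2, 8] would satisfy the contract quoted above; the
      // code below simply passes the exact count.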
static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; } continue; } assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. 
SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); if (OpFlags == X86II::MO_GOTPCREL) { // Add a wrapper. Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we // set X86_INTR calling convention because it has the same CSR mask // (same preserved registers). const uint32_t *Mask = RegInfo->getCallPreservedMask( MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. 
if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } // Define a new register mask from the existing mask. uint32_t *RegMask = nullptr; // In some calling conventions we need to remove the used physical registers // from the reg mask. if (CallConv == CallingConv::X86_RegCall || HasNCSR) { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Allocate a new Reg Mask and copy Mask. RegMask = MF.allocateRegisterMask(TRI->getNumRegs()); unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32; memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize); // Make sure all sub registers of the argument registers are reset // in the RegMask. for (auto const &RegPair : RegsToPass) for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); // Create the RegMask Operand according to our updated mask. Ops.push_back(DAG.getRegisterMask(RegMask)); } else { // Create the RegMask Operand according to the static mask. Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { // No need to reset the stack after the call if the call doesn't return. To // make the MI verify, we'll pretend the callee does it for us. NumBytesForCalleeToPop = NumBytes; } // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. 
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, RegMask); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. 
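    // (Editor's note: the nodes looked through below are ZERO_EXTEND,
    // ANY_EXTEND and BITCAST, plus a TRUNCATE whose input is an AssertZext
    // back to the truncated type; none of them change the low bits that the
    // caller's stack slot holds.)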
unsigned Op = Arg.getOpcode(); if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { Arg = Arg.getOperand(0); continue; } if (Op == ISD::TRUNCATE) { const SDValue &TruncInput = Arg.getOperand(0); if (TruncInput.getOpcode() == ISD::AssertZext && cast(TruncInput.getOperand(1))->getVT() == Arg.getValueType()) { Arg = TruncInput.getOperand(0); continue; } } break; } int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI.isFixedObjectIndex(FI)) return false; if (Offset != MFI.getObjectOffset(FI)) return false; // If this is not byval, check that the argument stack object is immutable. // inalloca and argument copy elision can create mutable argument stack // objects. Byval objects can be mutated, but a byval call intends to pass the // mutated memory. if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. if (Flags.isZExt() != MFI.isObjectZExt(FI) || Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. 
if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86, RetCC_X86)) return false; // The callee has to preserve all registers the caller needs to preserve. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. 
MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII, VA)) return false; } } } bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget.is64Bit() && ((!isa(Callee) && !isa(Callee)) || PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. 
    return false;
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}

//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (Op.hasOneUse()) {
    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
  }
  return false;
}

static bool isTargetShuffle(unsigned Opcode) {
  switch(Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VZEXT_MOVL:
    return true;
  }
}

static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}

SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}

bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model we assume that the latest object is 16MB before
  // the end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space.
  // We must not accept negative offsets, since such an offset could fall just
  // outside the object, but we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset >= 0)
    return true;

  return false;
}

/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}

/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return true;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return false;
  }
}

static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
  switch (SetCCOpcode) {
  default: llvm_unreachable("Invalid integer condition!");
  case ISD::SETEQ:  return X86::COND_E;
  case ISD::SETGT:  return X86::COND_G;
  case ISD::SETGE:  return X86::COND_GE;
  case ISD::SETLT:  return X86::COND_L;
  case ISD::SETLE:  return X86::COND_LE;
  case ISD::SETNE:  return X86::COND_NE;
  case ISD::SETULT: return X86::COND_B;
  case ISD::SETUGT: return X86::COND_A;
  case ISD::SETULE: return X86::COND_BE;
  case ISD::SETUGE: return X86::COND_AE;
  }
}

/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
                                    bool isFP, SDValue &LHS, SDValue &RHS,
                                    SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.flags = MachineMemOperand::MONone;
  Info.offset = 0;

  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOLoad;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.flags |= MachineMemOperand::MOStore;
    break;
  }
  default:
    return false;
  }

  return true;
}

/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations target a movq or addq instruction: don't let the load shrink.
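  // For example, `movq foo@GOTTPOFF(%rip), %rax` must keep its full 64-bit
  // width; narrowing the load would leave a relocation the linker cannot
  // apply to the rewritten instruction.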
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  if (VT.isVector() && Subtarget.hasAVX512())
    return false;

  return true;
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of a vector.
  if (ResVT.getVectorElementType() == MVT::i1)
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
                          (Index == ResVT.getVectorNumElements()));

  return (Index % ResVT.getVectorNumElements()) == 0;
}

bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
                                                EVT BitcastVT) const {
  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
    return false;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
}

bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                         const SelectionDAG &DAG) const {
  // Do not merge to float value size (128 bits) if no implicit
  // float attribute is set.
  bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
      Attribute::NoImplicitFloat);

  if (NoFloat) {
    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
    return (MemVT.getSizeInBits() <= MaxIntSize);
  }
  return true;
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}

MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}

/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}

/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
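    // For example, the v4i32 mask <-1, 1, 6, 7> widens to the v2i64 mask
    // <0, 3>: the undef M0 pairs with M1 = 1, and 6,7 form the aligned pair 3.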
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}

/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}

// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in the 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}

static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

/// Returns a vector of specified type with all zero elements.
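/// For example, getZeroVector(MVT::v4f32, ...) on an SSE2 target builds
/// (v4f32 (bitcast (v4i32 zero))), so all zero vectors share one node.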
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}

static SDValue extractSubVector(SDValue Vec, unsigned IdxVal,
                                SelectionDAG &DAG, const SDLoc &dl,
                                unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(ResultVT, dl,
                              Vec->ops().slice(IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF is Result
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
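  // For example, inserting a 128-bit subvector into a v8i32 at element 6
  // clamps IdxVal to 4 below: ElemsPerChunk = 4 and 6 & ~3 == 4.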
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

// Return true if the instruction zeroes the unused upper part of the
// destination and accepts mask.
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
  switch (Opcode) {
  default:
    return false;
  case X86ISD::TESTM:
  case X86ISD::TESTNM:
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
  case X86ISD::CMPM_RND:
    return true;
  }
}

/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  // Inserting undef is a nop. We can just return the original vector.
  if (SubVec.isUndef())
    return Vec;

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

  // Extend to natively supported kshift.
  MVT WideOpVT = OpVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
  // if necessary.
  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // May need to promote to a legal type.
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                     getZeroVector(WideOpVT, Subtarget, DAG, dl),
                     SubVec, Idx);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  SDValue Undef = DAG.getUNDEF(WideOpVT);

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
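    // In mask-register terms this computes
    // (Vec & ~((1 << SubVecNumElems) - 1)) | zext(SubVec).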
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                         getZeroVector(WideOpVT, Subtarget, DAG, dl),
                         SubVec, ZeroIdx);
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                       Undef, SubVec, ZeroIdx);

  if (Vec.isUndef()) {
    assert(IdxVal != 0 && "Unexpected index");
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(IdxVal, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    assert(IdxVal != 0 && "Unexpected index");
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(ShiftLeft, dl, MVT::i8));
    if (ShiftRight != 0)
      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
                           DAG.getConstant(ShiftRight, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
  }

  // Simple case when we put the subvector in the upper part.
  if (IdxVal + SubVecNumElems == NumElems) {
    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
                         DAG.getConstant(IdxVal, dl, MVT::i8));
    if (SubVecNumElems * 2 == NumElems) {
      // Special case, use legal zero extending insert_subvector. This allows
      // isel to optimize when bits are known zero.
      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        getZeroVector(WideOpVT, Subtarget, DAG, dl),
                        Vec, ZeroIdx);
    } else {
      // Otherwise use explicit shifts to zero the bits.
      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                        Undef, Vec, ZeroIdx);
      NumElems = WideOpVT.getVectorNumElements();
      SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    }
    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
  }

  // Inserting into the middle is more complicated.

  NumElems = WideOpVT.getVectorNumElements();

  // Widen the vector if needed.
  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
  // Move the current value of the bits to be replaced to the lsbs.
  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                   DAG.getConstant(IdxVal, dl, MVT::i8));
  // Xor with the new bits.
  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
  // Shift to MSB, filling bottom bits with 0.
  unsigned ShiftLeft = NumElems - SubVecNumElems;
  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
                   DAG.getConstant(ShiftLeft, dl, MVT::i8));
  // Shift to the final position, filling upper bits with 0.
  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
                   DAG.getConstant(ShiftRight, dl, MVT::i8));
  // Xor with original vector leaving the new value.
  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
  // Reduce to original width if needed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}

/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}

static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}

static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector())
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}

/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ?
                            NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}

static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
         V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  SDValue Ptr = Load->getBasePtr();
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
      Ptr->getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}

// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  // Bitcast a source array of element bits to the target size.
  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
           "Constant bit sizes don't match");

    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
      return false;

    // If we're already the right size, don't bother bitcasting.
    if (NumSrcElts == NumElts) {
      UndefElts = UndefSrcElts;
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
      return true;
    }

    // Extract all the undef/constant element data and pack into single bitsets.
    APInt UndefBits(SizeInBits, 0);
    APInt MaskBits(SizeInBits, 0);

    for (unsigned i = 0; i != NumSrcElts; ++i) {
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (UndefSrcElts[i])
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
      MaskBits.insertBits(SrcEltBits[i], BitOffset);
    }

    // Split the undef/constant single bitset data into the target elements.
    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };

  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned UndefBitIndex) {
    if (!Cst)
      return false;
    if (isa<UndefValue>(Cst)) {
      Undefs.setBit(UndefBitIndex);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask = CInt->getValue();
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask = CFP->getValueAPF().bitcastToAPInt();
      return true;
    }
    return false;
  };

  // Handle UNDEFs.
  if (Op.isUndef()) {
    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract scalar constant bits.
  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
    APInt UndefSrcElts = APInt::getNullValue(1);
    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      if (Src.isUndef()) {
        UndefSrcElts.setBit(i);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    }
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
    unsigned NumSrcElts = CstTy->getVectorNumElements();

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0; i != NumSrcElts; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
                               UndefSrcElts, i))
        return false;

    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= VT.getScalarSizeInBits()) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

      APInt UndefSrcElts(NumSrcElts, 0);
      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
        if (UndefSrcElts[0])
          UndefSrcElts.setBits(0, NumSrcElts);
        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
        return CastBitData(UndefSrcElts, SrcEltBits);
      }
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits;
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  return false;
}

static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

  // Extract the raw target constant bits.
  // FIXME: We currently don't support UNDEF bits or mask entries.
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}

/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
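/// For example, a binary v16i8 pack keeps the low byte of each i16 lane of
/// both inputs, giving the mask <0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30>.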
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
                                  bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits() / 128;
  unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
  unsigned Offset = Unary ? 0 : NumElts;

  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + (Lane * NumEltsPerLane));
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
  }
}

/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::EXTRQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(N->getOperand(2))) {
      int BitLen = N->getConstantOperandVal(1);
      int BitIdx = N->getConstantOperandVal(2);
      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
        isa<ConstantSDNode>(N->getOperand(3))) {
      int BitLen = N->getConstantOperandVal(2);
      int BitIdx = N->getConstantOperandVal(3);
      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKHMask(VT, Mask);
    IsUnary =
              IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it to the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 64> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 64> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  case X86ISD::VPERMIL2: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      SmallVector<uint64_t, 64> RawMask;
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SmallVector<uint64_t, 64> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if
       (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    SmallVector<uint64_t, 64> RawMask;
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMIV3: {
    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(0);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}

/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Extract known constant input data.
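  // For example, if V2 is a build_vector of constants, every mask element that
  // selects a zero constant from V2 can be rewritten as SM_SentinelZero below.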
  APInt UndefSrcElts[2];
  SmallVector<APInt, 64> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        Mask[i] = SM_SentinelUndef;
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        Mask[i] = SM_SentinelUndef;
      else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
    }
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               SelectionDAG &DAG) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
         "Expected byte aligned value types");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 64> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a vector,
    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
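    // For example, (v16i8 scalar_to_vector (i8 extract_vector_elt V, 3))
    // decodes to the mask <3, -1, ..., -1> over V: element 0 comes from V[3]
    // and the remaining lanes are undef.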
    SDValue N0 = N.getOperand(0);
    SDValue SrcExtract;

    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
         N0.getOperand(0).getValueType() == VT) ||
        (N0.getOpcode() == X86ISD::PEXTRW &&
         N0.getOperand(0).getValueType() == MVT::v8i16) ||
        (N0.getOpcode() == X86ISD::PEXTRB &&
         N0.getOperand(0).getValueType() == MVT::v16i8)) {
      SrcExtract = N0;
    }

    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    if (NumSrcElts <= SrcIdx)
      return false;

    Ops.push_back(SrcVec);
    Mask.push_back(SrcIdx);
    Mask.append(NumZeros, SM_SentinelZero);
    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    uint64_t InIdx = N.getConstantOperandVal(2);
    assert(InIdx < NumElts && "Illegal insertion index");

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0);
    uint64_t ExIdx = InScl.getConstantOperandVal(1);
    assert(ExIdx < NumElts && "Illegal extraction index");
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
           "Unexpected input value type");

    // If we know input saturation won't happen we can treat this
    // as a truncation shuffle.
    if (Opcode == X86ISD::PACKSS) {
      if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
          (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
        return false;
    } else {
      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
      if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
          (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
        return false;
    }

    bool IsUnary = (N0 == N1);

    Ops.push_back(N0);
    if (!IsUnary)
      Ops.push_back(N1);

    createPackShuffleMask(VT, Mask, IsUnary);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    unsigned NumBytes = NumSizeInBits / 8;
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
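    // For example, a v2i64 VSHLI by 8 bits (ByteShift = 1) produces the byte
    // mask <Z,0,1,2,3,4,5,6, Z,8,9,10,11,12,13,14>, where Z is SM_SentinelZero.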
    Mask.append(NumBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VZEXT: {
    // TODO - add support for VPMOVZX with smaller input vector types.
    SDValue Src = N.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}

/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;

    // Strip UNDEF input usage.
    if (Inputs[i].isUndef())
      for (int &M : Mask)
        if ((lo <= M) && (M < hi))
          M = SM_SentinelUndef;

    // Check for unused inputs.
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedInputs.push_back(Inputs[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  Inputs = UsedInputs;
}

/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask,
                                       SelectionDAG &DAG) {
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
      return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}

/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
    SDValue NewV = (Elt < NumElems) ?
        ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

// Use PINSRB/PINSRW/PINSRD to create a build vector.
static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
                                        unsigned NumNonZero, unsigned NumZero,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
         "Illegal vector insertion");

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
    if (!IsNonZero)
      continue;

    // If the build vector contains zeros or our first insertion is not the
    // first index then insert into zero vector to break any register
    // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
    if (First) {
      First = false;
      if (NumZero || 0 != i)
        V = getZeroVector(VT, Subtarget, DAG, dl);
      else {
        assert(0 == i && "Expected insertion into zero-index");
        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
        V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
        V = DAG.getBitcast(VT, V);
        continue;
      }
    }
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
                    DAG.getIntPtrConstant(i, dl));
  }

  return V;
}

/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41())
    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                    Subtarget);

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      // FIXME: Investigate extending to i32 instead of just i16.
      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
      if (LastIsNonZero) {
        LastElt =
            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
                              DAG.getConstant(8, dl, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt) {
        if (1 == i) {
          V = NumZero ?
              DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) :
              DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
          V = DAG.getBitcast(MVT::v8i16, V);
        } else {
          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                          DAG.getIntPtrConstant(i / 2, dl));
        }
      }
    }
  }

  return DAG.getBitcast(MVT::v16i8, V);
}

/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  // Use PINSRW to insert each byte directly.
  return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
                                  Subtarget);
}

/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Find all zeroable elements.
  std::bitset<4> Zeroable;
  for (int i = 0; i < 4; ++i) {
    SDValue Elt = Op->getOperand(i);
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i = 0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx + 4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
  }

  // See if we can lower this build_vector to an INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
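  // A worked example of the immediate computed below (a sketch, not from the
  // original source): INSERTPS packs the source element index in bits [7:6],
  // the destination element index in bits [5:4] and the zero mask in bits
  // [3:0]. With EltMaskIdx == 2, EltIdx == 1 and a zero mask of 0b1001, the
  // immediate is (2 << 6) | (1 << 4) | 0x9 == 0x99.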
  unsigned ZMask = Zeroable.to_ulong();
  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL));
  return DAG.getBitcast(VT, Result);
}

/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");
  MVT ShVT = MVT::v16i8;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SrcOp = DAG.getBitcast(ShVT, SrcOp);
  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}

static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {
  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. The address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    int EltNo = (Offset - StartOffset) >> 2;
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}

/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool isAfterLegalize) {
  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  SmallBitVector LoadMask(NumElems, false);
  SmallBitVector ZeroMask(NumElems, false);
  SmallBitVector UndefMask(NumElems, false);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();

    if (Elt.isUndef())
      UndefMask[i] = true;
    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
      ZeroMask[i] = true;
    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
      LoadMask[i] = true;
      LastLoadedElt = i;
      // Each loaded element must be the correct fractional portion of the
      // requested vector load.
      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
        return SDValue();
    } else
      return SDValue();
  }
  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.count() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask | UndefMask).count() == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.find_first();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
  EVT LDBaseVT = EltBase.getValueType();

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require an
  // additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      SDValue Elt = peekThroughBitcasts(Elts[i]);
      LoadSDNode *LD = cast<LoadSDNode>(Elt);
      if (!DAG.areNonVolatileConsecutiveLoads(
              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
              i - FirstLoadedElt)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  SmallVector<LoadSDNode *, 8> Loads;
  for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
    if (LoadMask[i])
      Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
           "Cannot merge volatile loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
    for (auto *LD : Loads)
      DAG.makeEquivalentMemoryOrdering(LD, NewLd);
    return NewLd;
  };

  // LOAD - all consecutive load/undefs (must start/end with a load).
  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.
  // If the vector contains zeros, then attempt to shuffle those elements.
  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    assert(LDBase && "Did not find base load for merging consecutive loads");
    EVT EltVT = LDBase->getValueType(0);
    // Ensure that the input vector size for the merged loads matches the
    // cumulative size of the input elements.
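    // For instance (an illustrative sketch, not from the original source): a
    // v4i32 build_vector <load p, load p+4, load p+8, load p+12> merges into
    // one v4i32 load from p, while <load p, zero, load p+8, load p+12> only
    // keeps IsConsecutiveLoadWithZeros set and takes the shuffle path below
    // to clear element 1 after the wide load.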
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();

    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
    // will lower to regular temporal loads and use the cache.
    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
        VT.is256BitVector() && !Subtarget.hasInt256())
      return SDValue();

    if (IsConsecutiveLoad)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
      SmallVector<int, 4> ClearMask(NumElems, -1);
      for (unsigned i = 0; i < NumElems; ++i) {
        if (ZeroMask[i])
          ClearMask[i] = i + NumElems;
        else if (LoadMask[i])
          ClearMask[i] = i;
      }
      SDValue V = CreateLoad(VT, LDBase);
      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                 : DAG.getConstantFP(0.0, DL, VT);
      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
    }
  }

  int LoadSize =
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSize == 32 || LoadSize == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
                                      : MVT::getIntegerVT(LoadSize);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  MachineMemOperand::MOLoad);
      for (auto *LD : Loads)
        DAG.makeEquivalentMemoryOrdering(LD, ResNode);
      return DAG.getBitcast(VT, ResNode);
    }
  }

  return SDValue();
}

static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      if (ScalarSize == 32) {
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
      } else {
        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
      }
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}

static bool isUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    if (isTargetShuffle(U->getOpcode()))
      return true;
    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
      return isUseOfShuffle(U);
  }
  return false;
}

// Check if the current node of build vector is a zero extended vector.
//
// If so, return the value extended.
//
// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
//
// NumElt - return the number of zero extended identical values.
//
// EltType - return the type of the value including the zero extension.
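//
// Illustrative sketch (not from the original source): for a v16i8
// build_vector (a,0,0,0, a,0,0,0, a,0,0,0, a,0,0,0) the repeat distance
// Delta is 4, so EltType becomes i32 (8 bits * 4) and NumElt becomes 4,
// i.e. the node is treated as a v4i32 splat of the zero-extended value a.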
static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
                                   unsigned &NumElt, MVT &EltType) {
  SDValue ExtValue = Op->getOperand(0);
  unsigned NumElts = Op->getNumOperands();
  unsigned Delta = NumElts;

  for (unsigned i = 1; i < NumElts; i++) {
    if (Op->getOperand(i) == ExtValue) {
      Delta = i;
      break;
    }
    if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
      return SDValue();
  }
  if (!isPowerOf2_32(Delta) || Delta == 1)
    return SDValue();

  for (unsigned i = Delta; i < NumElts; i++) {
    if (i % Delta == 0) {
      if (Op->getOperand(i) != ExtValue)
        return SDValue();
    } else if (!(isNullConstant(Op->getOperand(i)) ||
                 Op->getOperand(i).isUndef()))
      return SDValue();
  }
  unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
  unsigned ExtVTSize = EltSize * Delta;
  EltType = MVT::getIntegerVT(ExtVTSize);
  NumElt = NumElts / Delta;
  return ExtValue;
}

/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
    MVT EltType = VT.getScalarType();
    unsigned NumElts = VT.getVectorNumElements();
    SDValue BOperand;
    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
      if (ZeroExtended)
        BOperand = ZeroExtended.getOperand(0);
      else
        BOperand = Ld.getOperand(0).getOperand(0);
      if (BOperand.getValueType().isVector() &&
          BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
                                     NumElts == 8)) ||  // for broadcastmb2q
            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
                                     NumElts == 16))) { // for broadcastmw2d
          SDValue Brdcst =
              DAG.getNode(X86ISD::VBROADCASTM, dl,
                          MVT::getVectorVT(EltType, NumElts), BOperand);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }
  }

  // We need a splat of a single value to use broadcast, and it doesn't
  // make any sense if the value is only in one element of the vector.
  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
        return SDValue();
      // replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
          // Splatted value can fit in one FLOAT constant in constant pool.
          // Load the constant and broadcast it.
          // AVX have support for 32 and 64 bit broadcast for floats only.
          // No 64bit integer in 32bit subtarget.
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
          // Lower the splat via APFloat directly, to avoid any conversion.
          Constant *C =
              SplatBitSize == 32
                  ? ConstantFP::get(*Ctx,
                                    APFloat(APFloat::IEEEsingle(), SplatValue))
                  : ConstantFP::get(*Ctx,
                                    APFloat(APFloat::IEEEdouble(), SplatValue));
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }
    return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
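  // For example (a sketch, not from the original source): a v8f32 splat of
  // 42.0f can be lowered to a single f32 constant-pool load feeding an
  // X86ISD::VBROADCAST (vbroadcastss), instead of materializing a full
  // 32-byte constant vector in the pool.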
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit so it doesn't
  // match double since there is no vbroadcastsd xmm.
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}

/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  // (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  // (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                     (extract_subvector (v8f32 %0), Constant<4>),
  //                     undef)
  //                     Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}

static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
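  // (Sketch of the overall transform, not from the original source: a
  // build_vector <extract V0,0>, <extract V0,1>, x, <extract V0,3> becomes a
  // shuffle of V0 followed by one INSERT_VECTOR_ELT of x at index 2.)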
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}

static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
  }
  SDLoc dl(Op);
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return Op;

  if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Split the pieces.
      SDValue Lower =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
      SDValue Upper =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
      // We have to manually lower both halves so getNode doesn't try to
      // reassemble the build_vector.
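      // (Each half is a constant v32i1 that then becomes an integer bitmask;
      // as an illustrative sketch, a v8i1 constant <1,0,1,1,0,0,0,0> converts
      // to the i8 immediate 0b00001101, element 0 landing in bit 0.)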
      Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
      Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
    }
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Vector has one or more non-const elements
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat)
    return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
                         DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));

  // insert elements one by one
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
  } else if (HasConstElts)
    Imm = DAG.getConstant(0, dl, VT);
  else
    Imm = DAG.getUNDEF(VT);
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
    DstVec = DAG.getBitcast(VT, Imm);
  else {
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}

/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
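      // (For example, matching a v4f32 HADD — a sketch, not from the original
      // source: the expected extract indices run 0,2 over vector A for the
      // low half of the result, then restart at 0,2 over vector B for the
      // high half, which is what the reset below implements.)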
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}

/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binop.
/// When Mode is set, the first horizontal binop dag node would take as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node would take as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V0_HI
///     HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V1_LO
///     HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
static bool isAddSub(const BuildVectorSDNode *BV,
                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
                     SDValue &Opnd0, SDValue &Opnd1,
                     unsigned &NumExtracts) {

  MVT VT = BV->getSimpleValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return false;

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);

    // Increment the number of extractions done.
    ++NumExtracts;
  }

  // Don't try to fold this build_vector into an ADDSUB if the inputs are
  // undef.
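  // (Recall the idiom just matched — an illustrative sketch, not from the
  // original source: a v4f32 build_vector
  //   <a0-b0, a1+b1, a2-b2, a3+b3>
  // with all extracts from the same A and B becomes X86ISD::ADDSUB(A, B).)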
  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
    return false;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}

/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters
/// \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
///    %AB = fmul fast <2 x double> %A, %B
///    %Sub = fsub fast <2 x double> %AB, %C
///    %Add = fadd fast <2 x double> %AB, %C
///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///              <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
///    %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
///    %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1,
                                 SDValue &Opnd2, unsigned ExpectedUses) {
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}

/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
/// operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  // TODO: According to coverage reports, the FMADDSUB transform is not
  // triggered by any tests.
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
  // recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}

/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
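///
/// For example (an illustrative sketch, not from the original source): a
/// v4f32 build_vector <a0+a1, a2+a3, b0+b1, b2+b3>, with every operand an
/// extract from the same A and B, lowers to X86ISD::FHADD(A, B) (haddps).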
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget.hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
             isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
                               InVec3) &&
             ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
             ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binop followed by
      // a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }

  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}

/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general purpose
/// vectorizer from this, but enough scalar bit operations are created from
/// the later legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR:
    // Don't do this if the buildvector is a splat - we'd replace one
    // constant with an entire vector.
    if (Op->getSplatValue())
      return SDValue();
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}

/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
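  // (E.g., a sketch, not from the original source: a v4i32 all-zeros
  // build_vector selects to "pxor %xmm0, %xmm0", with no constant-pool
  // traffic at all.)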
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;
    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  return SDValue();
}

// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
// reasoned to be a permutation of a vector by indices in a non-constant
// vector.
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//                    ...
// ->
// (vpermv I, V)
//
// TODO: Handle undefs
// TODO: Utilize pshufb and zero mask blending to support more efficient
// construction of vectors with constant-0 elements.
// TODO: Use smaller-element vectors of same width, and "interpolate" the
// indices, when no native operation available.
static SDValue
LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  // Look for VPERMV and PSHUFB opportunities.
  MVT VT = V.getSimpleValueType();
  switch (VT.SimpleTy) {
  default:
    return SDValue();
  case MVT::v16i8:
    if (!Subtarget.hasSSE3())
      return SDValue();
    break;
  case MVT::v8f32:
  case MVT::v8i32:
    if (!Subtarget.hasAVX2())
      return SDValue();
    break;
  case MVT::v4i64:
  case MVT::v4f64:
    if (!Subtarget.hasVLX())
      return SDValue();
    break;
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v16i32:
  case MVT::v8i64:
    if (!Subtarget.hasAVX512())
      return SDValue();
    break;
  case MVT::v32i16:
    if (!Subtarget.hasBWI())
      return SDValue();
    break;
  case MVT::v8i16:
  case MVT::v16i16:
    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
      return SDValue();
    break;
  case MVT::v64i8:
    if (!Subtarget.hasVBMI())
      return SDValue();
    break;
  case MVT::v32i8:
    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
      return SDValue();
    break;
  }
  SDValue SrcVec, IndicesVec;
  // Check for a match of the permute source vector and permute index elements.
  // This is done by checking that the i-th build_vector operand is of the
  // form: (extract_elt SrcVec, (extract_elt IndicesVec, i)).
  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
    SDValue Op = V.getOperand(Idx);
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract encountered in V, set the source vector,
    // otherwise verify the extract is from the previously defined source
    // vector.
    if (!SrcVec)
      SrcVec = Op.getOperand(0);
    else if (SrcVec != Op.getOperand(0))
      return SDValue();
    SDValue ExtractedIndex = Op->getOperand(1);
    // Peek through extends.
    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
      ExtractedIndex = ExtractedIndex.getOperand(0);
    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // If this is the first extract from the index vector candidate, set the
    // indices vector, otherwise verify the extract is from the previously
    // defined indices vector.
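    // (Continuing the sketch from the function comment: for
    //  (build_vector (extract_elt V, (extract_elt I, 0)),
    //                (extract_elt V, (extract_elt I, 1)), ...)
    // every element must extract from the same index vector I, in operand
    // order, for the whole node to fold to a single VPERMV/PSHUFB.)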
    if (!IndicesVec)
      IndicesVec = ExtractedIndex.getOperand(0);
    else if (IndicesVec != ExtractedIndex.getOperand(0))
      return SDValue();

    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
    if (!PermIdx || PermIdx->getZExtValue() != Idx)
      return SDValue();
  }
  MVT IndicesVT = VT;
  if (VT.isFloatingPoint())
    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
                                 VT.getVectorNumElements());
  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
  if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
    SrcVec = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT,
                         DAG.getUNDEF(VT), SrcVec,
                         DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
  }
  if (VT == MVT::v16i8)
    return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
  return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
  // transform here.
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  unsigned NumConstants = NumElems;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
      IsAllConstants = false;
      NumConstants--;
    }
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // If we are inserting one variable into a vector of non-zero constants, try
  // to avoid loading each constant element as a scalar. Load the constants as
  // a vector and then insert the variable scalar element. If insertion is not
  // supported, we assume that we will fall back to a shuffle to get the
  // scalar blended with the constants. Insertion into a zero vector is
  // handled as a special-case somewhere below here.
  LLVMContext &Context = *DAG.getContext();
  if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
      (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
       isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
    // Create an all-constant vector. The variable element in the old
    // build vector is replaced by undef in the constant vector. Save the
    // variable scalar element and its index for use in the insertelement.
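    // (Illustrative sketch, not from the original source: for
    // <1.0f, 2.0f, x, 4.0f> this builds the constant
    // <1.0f, 2.0f, undef, 4.0f> as a pool load and then emits one
    // INSERT_VECTOR_ELT of x at index 2.)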
    Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
    SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
    SDValue VarElt;
    SDValue InsIndex;
    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Elt = Op.getOperand(i);
      if (auto *C = dyn_cast<ConstantSDNode>(Elt))
        ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
      else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
        ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
      else if (!Elt.isUndef()) {
        assert(!VarElt.getNode() && !InsIndex.getNode() &&
               "Expected one variable element in this vector");
        VarElt = Elt;
        InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
      }
    }
    Constant *CV = ConstantVector::get(ConstVecOps);
    SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

    // The constants we just created may not be legal (e.g., floating point).
    // We must lower the vector right here because we cannot guarantee that
    // we'll legalize it before loading it. This is also why we could not just
    // create a new build vector here. If the build vector contains illegal
    // constants, it could get split back up into a series of insert elements.
    // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
    SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
    MachineFunction &MF = DAG.getMachineFunction();
    MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
    SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
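      // (MOVD always moves a full 32 bits, so the byte/word value is
      // zero-extended to i32 and inserted as a 32-bit element instead.)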
if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.getSizeInBits() >= 256) { MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); if (Subtarget.hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. 
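      // e.g. v2i64 <0, x>: emit (scalar_to_vector x) and shuffle x into
      // lane 1 with zero filling lane 0.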
unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros >> (i*2)) & 0x3) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget.hasSSE41()) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. 
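    // e.g. for v4f32: Scale == 1 gives <0,4,u,u>, Scale == 2 gives <0,1,4,5>.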
SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if (ResVT.is256BitVector()) return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return concat256BitVectors( concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl), concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT, NumElems, DAG, dl); } return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } // Return true if all the operands of the given CONCAT_VECTORS node are zeros // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0) static bool isExpandWithZeros(const SDValue &Op) { assert(Op.getOpcode() == ISD::CONCAT_VECTORS && "Expand with zeros only possible in CONCAT_VECTORS nodes!"); for (unsigned i = 1; i < Op.getNumOperands(); i++) if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode())) return false; return true; } // Returns true if the given node is a type promotion (by concatenating i1 // zeros) of the result of a node that already zeros all upper bits of // k-register. static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) { unsigned Opc = Op.getOpcode(); assert(Opc == ISD::CONCAT_VECTORS && Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected node to check for type promotion!"); // As long as we are concatenating zeros to the upper part of a previous node // result, climb up the tree until a node with different opcode is // encountered while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) { if (Opc == ISD::INSERT_SUBVECTOR) { if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) && Op.getConstantOperandVal(2) == 0) Op = Op.getOperand(1); else return SDValue(); } else { // Opc == ISD::CONCAT_VECTORS if (isExpandWithZeros(Op)) Op = Op.getOperand(0); else return SDValue(); } Opc = Op.getOpcode(); } // Check if the first inserted node zeroes the upper bits, or an 'and' result // of a node that zeros the upper bits (its masked version). 
if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) || (Op.getOpcode() == ISD::AND && (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) || isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) { return Op; } return SDValue(); } static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOperands = Op.getNumOperands(); assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); // If this node promotes - by concatenating zeroes - the type of the result // of a node with instruction that zeroes all upper (irrelevant) bits of the // output register, mark it as legal and catch the pattern in instruction // selection to avoid emitting extra instructions (for zeroing upper bits). if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { SDValue ZeroC = DAG.getIntPtrConstant(0, dl); SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, ZeroC); } unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= (uint64_t)1 << i; ++NumNonZero; } } // If there are zero or one non-zeros we can handle this very simply. if (NumNonZero <= 1) { SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); if (!NumNonZero) return Vec; unsigned Idx = countTrailingZeros(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(NumNonZero == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. 
It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// \brief Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// \brief Test whether a shuffle mask is equivalent within each sub-lane. /// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. 
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}

/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}

/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}

// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
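// e.g. Mask <0,5,u,3> with Zeroable bit 1 set becomes
// <0, SM_SentinelZero, SM_SentinelUndef, 3>.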
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}

// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}

/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  // Each 2-bit field selects the source lane for the corresponding result
  // lane, e.g. mask <3,2,1,0> encodes as 0b00011011 (0x1b).
  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}

/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the
/// input referenced is undef, or the element of the input referenced is known
/// to be zero. Many x86 shuffles can zero lanes cheaply and we often want to
/// handle as many lanes with this technique as possible to simplify the
/// remaining shuffle.
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
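    // When a bitcast has changed the element count, the index is rescaled:
    // e.g. a v8i16 mask over a bitcast v2i64 build_vector tests the
    // (M % 4)-th 16-bit slice of the selected i64 constant.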
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements, then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}

// The shuffle result takes the form 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where
// the a[] elements appear in ascending order. Each element of Zeroable
// corresponds to a particular element of Mask, as computed by
// computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose non-zero elements appear in
// increasing order; if such a sub-mask exists, it returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef<int> Mask,
                                     const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's non-zero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Check that the mask's zero elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non-zero element.
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non-zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
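    // e.g. for v32i8 on AVX2, result byte 20 can only be sourced from bytes
    // 16..31 of the input; VPSHUFB shuffles within each 128-bit lane.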
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; return DAG.getSelect(DL, VT, VMask, DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? 
getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); auto MatchPACK = [&](SDValue N1, SDValue N2) { SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } return false; }; // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) if (MatchPACK(V1, V1)) return true; return false; } static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); return SDValue(); } /// \brief Try to emit a bitmask instruction for a shuffle. 
/// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); return DAG.getNode(ISD::AND, DL, VT, V, VMask); } /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); // We have to cast V2 around. MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask), DAG.getBitcast(MaskVT, V2))); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, MutableArrayRef TargetMask, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. 
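  // e.g. with 4 elements, mask <0,5,2,7> is a blend with BlendMask 0b1010
  // (a set bit selects the V2 element for that lane).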
for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { int M = TargetMask[i]; if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1ull << i; continue; } if (M == SM_SentinelZero) { if (V1IsZeroOrUndef) { ForceV1Zero = true; TargetMask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; TargetMask[i] = i + Size; continue; } } return false; } return true; } static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); return ScaledMask; } /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } LLVM_FALLTHROUGH; case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. 
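      // The same 8-bit immediate is applied to both 128-bit lanes, so the
      // 8-element repeated mask determines all 16 result lanes at once.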
assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } LLVM_FALLTHROUGH; } case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return Masked; // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } case MVT::v16f32: case MVT::v8f64: case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// \brief Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! 
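    // (e.g. <0,4,...>: V1[0] and V2[0] both need blend slot 0, which a
    // single blend cannot provide.)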
PermuteMask[i] = Mask[i] % Size; } SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// \brief Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend strategy unless one of the // input shuffles would be a no-op. We prefer to shuffle inputs as the // shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, blending // first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// \brief Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. 
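    // e.g. v8i16 mask <11,12,13,14,15,0,1,2>: every element yields
    // CandidateRotation == 3, with Lo = V1 and Hi = V2.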
if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// \brief Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. 
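  // PALIGNR's immediate counts bytes, so the element rotation is scaled by
  // the element size: a v8i16 rotation of 3 elements becomes imm 6.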
if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// \brief Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getConstant(Rotation, DL, MVT::i8)); } /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 
0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchVectorShuffleAsShift( ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. 
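  // e.g. v8i16 <2,3,zz,zz,u,u,u,u>: Len == 2 and Idx == 2, giving
  // BitLen == 32 and BitIdx == 32 (bits [32,64) of V1).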
int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// \brief Lower a vector shuffle as a zero or any extension. 
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offsetted element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must be from the same lane.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt,
    SDValue InputV, ArrayRef<int> Mask, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1,
                         SafeOffset(Offset + 1) ? Offset + 1 : -1, -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
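  // For example, zero extending the low two i16 elements to i64 emits
  // EXTRQI(V, BitLen=16, BitIdx=0) and EXTRQI(V, BitLen=16, BitIdx=16), then
  // interleaves the two results with UNPCKL.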
if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. 
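/// For example, the v16i8 mask <0, zz, zz, zz, 1, zz, zz, zz, 2, zz, zz, zz,
/// 3, zz, zz, zz> is a zero extension of the low four i8 elements to i32
/// (Scale == 4).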
static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. 
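  // For example, the v4i32 mask <0, 1, zz, zz> copies the low two elements of
  // V1 and zeroes the upper half, which is exactly MOVQ.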
auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// \brief Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// \brief Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. 
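      // For example, inserting an i16 scalar into v8i16 becomes an i32
      // insertion into v4i32, so that VZEXT_MOVL can clear the high bits.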
ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy( DAG.getDataLayout(), VT))); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. 
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}

/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
      SDValue VSrc = V.getOperand(0);
      unsigned NumEltBits = V.getScalarValueSizeInBits();
      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
      if ((NumEltBits % NumSrcBits) == 0)
        BroadcastIdx *= (NumEltBits / NumSrcBits);
      else if ((NumSrcBits % NumEltBits) == 0 &&
               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
        BroadcastIdx /= (NumSrcBits / NumEltBits);
      else
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Ensure the source vector and BroadcastIdx are for a suitable type.
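  // For example, if the walk above peeked through a bitcast from v4i32 to the
  // shuffle's v8i16 type, a broadcast index of 1 in i32 units is rescaled to
  // index 2 in i16 units below.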
  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    unsigned NumSrcBits = V.getScalarValueSizeInBits();
    if ((NumSrcBits % NumEltBits) == 0)
      BroadcastIdx *= (NumSrcBits / NumEltBits);
    else if ((NumEltBits % NumSrcBits) == 0 &&
             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
      BroadcastIdx /= (NumEltBits / NumSrcBits);
    else
      return SDValue();
    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
    V = DAG.getBitcast(SrcVT, V);
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
                   ? X86ISD::MOVDDUP
                   : Opcode;
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); V = extract128BitVector(V, BroadcastIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. MVT SrcVT = V.getSimpleValueType(); if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { SrcVT = BroadcastVT.getScalarType(); } V = DAG.getBitcast(SrcVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (SrcVT.getSizeInBits() > 128) { MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / SrcVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef Mask, SelectionDAG &DAG) { assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Attempt to match INSERTPS with one element from VA or VB being // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask // are updated. auto matchAsInsertPS = [&](SDValue VA, SDValue VB, ArrayRef CandidateMask) { unsigned ZMask = 0; int VADstIndex = -1; int VBDstIndex = -1; bool VAUsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any VA inputs in place. if (i == CandidateMask[i]) { VAUsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (VADstIndex >= 0 || VBDstIndex >= 0) return false; if (CandidateMask[i] < 4) { // VA input out of place for insertion. VADstIndex = i; } else { // VB input for insertion. VBDstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (VADstIndex < 0 && VBDstIndex < 0) return false; // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. 
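    // The immediate built below packs these as (src << 6) | (dst << 4) | zmask,
    // matching the INSERTPS count_s/count_d/zmask fields.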
unsigned VBSrcIndex = 0; if (VADstIndex >= 0) { // If we have a VA input out of place, we use VA as the V2 element // insertion and don't use the original V2 at all. VBSrcIndex = CandidateMask[VADstIndex]; VBDstIndex = VADstIndex; VB = VA; } else { VBSrcIndex = CandidateMask[VBDstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!VAUsedInPlace) VA = DAG.getUNDEF(MVT::v4f32); // Update V1, V2 and InsertPSMask accordingly. V1 = VA; V2 = VB; // Insert the V2 element into the desired position. InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); return true; }; if (matchAsInsertPS(V1, V2, Mask)) return true; // Commute and try again. SmallVector CommutedMask(Mask.begin(), Mask.end()); ShuffleVectorSDNode::commuteMask(CommutedMask); if (matchAsInsertPS(V2, V1, CommutedMask)) return true; return false; } static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); // Attempt to match the insertps pattern. unsigned InsertPSMask; if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG)) return SDValue(); // Insert the V2 element into the desired position. return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } /// \brief Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(VT.is128BitVector() && "This routine only works on 128-bit vectors."); assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 
0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}

/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ?
DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. 
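    // For example, the v2i64 mask <1, 0> becomes the v4i32 PSHUFD mask
    // <2, 3, 0, 1> below.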
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
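  // For example, <0, 1, 4, 5> takes each half from a single source and can be
  // one SHUFPS, while <0, 4, 1, 5> mixes both sources within the low half and
  // cannot.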
assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 
3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. 
  if (!Subtarget.hasSSE2()) {
    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
  }

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2,
                                                   Mask, Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v4i32, V1, V2, Mask, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); // Attempt to directly match PSHUFLW or PSHUFHW. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? 
HiMask[i] : (HiMask[i] - 4));
    return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                       getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
  }

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                 LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                 HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // If we are shuffling values from one half - check how many different DWORD
  // pairs we need to create. If only 1 or 2 then we can perform this as a
  // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
  auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
                               ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
    int PSHUFDMask[4] = { -1, -1, -1, -1 };
    SmallVector<std::pair<int, int>, 4> DWordPairs;
    int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

    // Collect the different DWORD pairs.
    for (int DWord = 0; DWord != 4; ++DWord) {
      int M0 = Mask[2 * DWord + 0];
      int M1 = Mask[2 * DWord + 1];
      M0 = (M0 >= 0 ? M0 % 4 : M0);
      M1 = (M1 >= 0 ? M1 % 4 : M1);
      if (M0 < 0 && M1 < 0)
        continue;
      bool Match = false;
      for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
        auto &DWordPair = DWordPairs[j];
        if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
            (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
          DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
          DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
          PSHUFDMask[DWord] = DOffset + j;
          Match = true;
          break;
        }
      }
      if (!Match) {
        PSHUFDMask[DWord] = DOffset + DWordPairs.size();
        DWordPairs.push_back(std::make_pair(M0, M1));
      }
    }

    if (DWordPairs.size() <= 2) {
      DWordPairs.resize(2, std::make_pair(-1, -1));
      int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                              DWordPairs[1].first, DWordPairs[1].second};
      if ((NumHToL + NumHToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
      if ((NumLToL + NumLToH) == 0)
        return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
    }
  }

  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half.
In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. 
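        // For example (an illustrative restatement of the counting below):
        // if ADWord == 1, the A-half words 2 and 3 are the ones swapped
        // across the half boundary, so an AToB input equal to 2 or 3 is
        // "flipped" by that swap and is what NumFlippedAToBInputs counts.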
int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. 
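  // For example (illustrative, not part of the original comments):
  //
  // Input: [a, b, c, d, e, f, g, h]
  // Mask:  [4, 5, 0, 1, 6, 7, 2, 3]
  //
  // Words (4,5), (0,1), (6,7) and (2,3) already sit together as dwords, so no
  // PSHUFLW/PSHUFHW fix-up is needed and a single PSHUFD[2,0,3,1] places the
  // dwords in their final positions.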
int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. 
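        // ((Input - SourceOffset) + DestOffset) / 2 is the dword slot in the
        // destination half that mirrors the input's slot within its source
        // half; Input / 2 is the source dword we route into that slot.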
if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. 
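        // The swap moved the non-input word that sat in slot
        // (InputsFixed[0] ^ 1) into InputsFixed[1]'s old slot, so references
        // to either slot in FinalSourceHalfMask are exchanged below.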
for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SDValue V1Mask[16]; SDValue V2Mask[16]; V1InUse = false; V2InUse = false; int Size = Mask.size(); int Scale = 16 / Size; for (int i = 0; i < 16; ++i) { if (Mask[i / Scale] < 0) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale : ZeroMask; int V2Idx = Mask[i / Scale] < Size ? 
ZeroMask : (Mask[i / Scale] - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } } if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V1), DAG.getBuildVector(MVT::v16i8, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V2), DAG.getBuildVector(MVT::v16i8, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. 
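  // (On SSE4A targets, EXTRQ/INSERTQ can extract or insert a contiguous run
  // of low elements in a single instruction, which covers some of these
  // masks directly.)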
if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, DAG); } /// \brief Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. /// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. 
This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// \brief Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. 
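    // For example (illustrative): the v16i8 mask
    //   <0, 0, 5, 5, 1, 1, 4, 4, 0, 0, 5, 5, 1, 1, 4, 4>
    // duplicates every byte and only uses inputs 0, 1, 4 and 5, so the source
    // bytes can be gathered by a single pre-duplication v8i16 shuffle, spread
    // out with an unpack, and finished with one more i16 shuffle.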
// // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. 
But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return BitBlend; // Check whether a compaction lowering can be done. This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. bool IsSingleInput = V2.isUndef(); if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getBitcast( MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? 
V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
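/// For example (illustrative), a v8i32 shuffle with mask
/// <0, 8, 1, 9, 6, 14, 7, 15> becomes a v4i32 shuffle <0, 4, 1, 5> of the two
/// low halves concatenated with a v4i32 shuffle <2, 6, 3, 7> of the two high
/// halves.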
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. auto SplitVector = [&](SDValue V) { V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask((unsigned)SplitNumElements, -1); SmallVector V2BlendMask((unsigned)SplitNumElements, -1); SmallVector BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask[i] = M - NumElements; BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V1BlendMask[i] = M; BlendMask[i] = i; } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 
0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// \brief Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG); } /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. 
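  // For example (illustrative), for v4f64 with mask <2, 1, 0, 3> the lanes of
  // V1 are swapped to produce elements <2, 3, 0, 1>, and the result is then
  // blended with the original as <4, 1, 6, 3>: cross-lane elements come from
  // the flipped copy, in-lane elements from V1.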
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector FlippedBlendMask(Size); for (int i = 0; i < Size; ++i) FlippedBlendMask[i] = Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size); // Flip the vector, and blend the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), { 2, 3, 0, 1 }); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } /// \brief Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } // Try to use SHUF128 if possible. 
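    // (SHUF128 takes one 128-bit block from each source, so this only fires
    // when the low half of the result comes from V1 and the high half from
    // V2; the immediate selects the block within each source.)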
if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This will only succeed when the result of fixing the 128-bit lanes results /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in /// each 128-bit lanes. This handles many cases where we can quickly blend away /// the lane crosses early and then use simpler shuffles within each lane. /// /// FIXME: It might be worthwhile at some point to support this without /// requiring the 128-bit lane-relative shuffles to be repeating, but currently /// in x86 only floating point has interesting non-repeating shuffles, and even /// those are still *marginally* more expensive. static SDValue lowerVectorShuffleByMerging128BitLanes( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); int NumLanes = Size / LaneSize; assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles."); // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also // check whether the in-128-bit lane shuffles share a repeating pattern. SmallVector Lanes((unsigned)NumLanes, -1); SmallVector InLaneMask((unsigned)LaneSize, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; int j = i / LaneSize; if (Lanes[j] < 0) { // First entry we've seen for this lane. Lanes[j] = Mask[i] / LaneSize; } else if (Lanes[j] != Mask[i] / LaneSize) { // This doesn't match the lane selected previously! return SDValue(); } // Check that within each lane we have a consistent shuffle mask. int k = i % LaneSize; if (InLaneMask[k] < 0) { InLaneMask[k] = Mask[i] % LaneSize; } else if (InLaneMask[k] != Mask[i] % LaneSize) { // This doesn't fit a repeating in-lane mask. return SDValue(); } } // First shuffle the lanes into place. MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? 
MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 16> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert((VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected 256-bit or 512-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
return SDValue(); } assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); // Only shuffle the halves of the inputs when useful. int NumLowerHalves = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); int NumUpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); // uuuuXXXX - don't extract uppers just to insert again. if (UndefLower && NumUpperHalves != 0) return SDValue(); // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. if (UndefUpper && NumUpperHalves == 2) return SDValue(); // AVX2 - XXXXuuuu - always extract lowers. if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // AVX2 supports variable 32-bit element cross-lane shuffles. if (VT == MVT::v8f32 || VT == MVT::v8i32) { // XXXXuuuu - don't extract lowers and uppers. if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) return SDValue(); } } // AVX512 - XXXXuuuu - always extract lowers. if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) return SDValue(); auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; SDValue Half1 = GetHalfVector(HalfIdx1); SDValue Half2 = GetHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Handle case where shuffle sources are coming from the same 128-bit lane and /// every lane can be represented as the same repeating mask - allowing us to /// shuffle the sources with the repeating shuffle and then permute the result /// to the destination lanes. static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // On AVX2 we may be able to just shuffle the lowest elements and then // broadcast the result. if (Subtarget.hasAVX2()) { for (unsigned BroadcastSize : {16, 32, 64}) { if (BroadcastSize <= VT.getScalarSizeInBits()) continue; int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); // Attempt to match a repeating pattern every NumBroadcastElts, // accounting for UNDEFs but only references the lowest 128-bit // lane of the inputs. 
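      // For example (illustrative), for v8i32 with BroadcastSize == 64 the
      // mask <1, 0, 1, 0, 1, 0, 1, 0> repeats <1, 0> every two elements and
      // only touches lane 0, so we shuffle <1, 0> into place once and then
      // broadcast it with the mask <0, 1, 0, 1, 0, 1, 0, 1>.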
auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) { int M = Mask[i + j]; if (M < 0) continue; int &R = RepeatMask[j]; if (0 != ((M % NumElts) / NumLaneElts)) return false; if (0 <= R && R != M) return false; R = M; } return true; }; SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); if (!FindRepeatingBroadcastMask(RepeatMask)) continue; // Shuffle the (lowest) repeated elements in place for broadcast. SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); // Shuffle the actual broadcast. SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) BroadcastMask[i + j] = j; return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), BroadcastMask); } } // Bail if the shuffle mask doesn't cross 128-bit lanes. if (!is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); // Bail if we already have a repeated lane shuffle mask. SmallVector<int, 8> RepeatedShuffleMask; if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) return SDValue(); // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; int NumSubLanes = NumLanes * SubLaneScale; int NumSubLaneElts = NumLaneElts / SubLaneScale; // Check that all the sources are coming from the same lane and see if we can // form a repeating shuffle mask (local to each sub-lane). At the same time, // determine the source sub-lane for each destination sub-lane. int TopSrcSubLane = -1; SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); SmallVector<int, 8> RepeatedSubLaneMasks[2] = { SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef), SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)}; for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { // Extract the sub-lane mask, check that it all comes from the same lane // and normalize the mask entries to come from the first lane. int SrcLane = -1; SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1); for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; if (M < 0) continue; int Lane = (M % NumElts) / NumLaneElts; if ((0 <= SrcLane) && (SrcLane != Lane)) return SDValue(); SrcLane = Lane; int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); SubLaneMask[Elt] = LocalM; } // Whole sub-lane is UNDEF. if (SrcLane < 0) continue; // Attempt to match against the candidate repeated sub-lane masks. for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) { for (int i = 0; i != NumSubLaneElts; ++i) { if (M1[i] < 0 || M2[i] < 0) continue; if (M1[i] != M2[i]) return false; } return true; }; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) continue; // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { int M = SubLaneMask[i]; if (M < 0) continue; assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && "Unexpected mask element"); RepeatedSubLaneMask[i] = M; } // Track the topmost source sub-lane - by setting the remaining to UNDEF // we can greatly simplify shuffle matching.
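// For illustration: on AVX2 a v4f64 mask <2, 3, 0, 1> (SubLaneScale == 2) needs no repeated in-lane shuffle at all - every sub-lane mask is the identity - and Dst2SrcSubLanes becomes {2, 3, 0, 1}, which the final step can lower as a single 64-bit sub-lane permute.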
int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); Dst2SrcSubLanes[DstSubLane] = SrcSubLane; break; } // Bail if we failed to find a matching repeated sub-lane mask. if (Dst2SrcSubLanes[DstSubLane] < 0) return SDValue(); } assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && "Unexpected source lane"); // Create a repeating shuffle mask for the entire vector. SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { int Lane = SubLane / SubLaneScale; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; int Idx = (SubLane * NumSubLaneElts) + Elt; RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumSubLaneElts) { int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; if (SrcSubLane < 0) continue; for (int j = 0; j != NumSubLaneElts; ++j) SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); } static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef<int> Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, .. ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] < 0) return false; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; ShuffleImm |= (Mask[i] % 2) << i; } if (ShufpdMask) return true; if (CommutableMask) { std::swap(V1, V2); return true; } return false; } static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getConstant(Immediate, DL, MVT::i8)); } /// \brief Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either input is already in place, // we will be able to shuffle the other input across lanes in a single // instruction, so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because at v4 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.
static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. SmallVector<int, 2> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector<int, 4> PSHUFDMask; scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } // AVX2 provides a direct instruction for permuting a single input across // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either input is already in place, // we will be able to shuffle the other input across lanes in a single // instruction, so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available.
static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes, use a variable mask with VPERMILPS. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // For non-AVX512, if the mask is an in-lane unpack of 16-bit elements, try to // split, since after the split we get more efficient code using vpunpcklwd // and vpunpckhwd than with vblend. if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements.
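// (Here "decompose" means each input is first shuffled into place independently and the two results are then combined with a single blend, e.g. one VBLENDPS.)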
if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512, if the mask is an in-lane unpack of 16-bit elements, try to // split, since after the split we get more efficient code than vblend by // using vpunpcklwd and vpunpckhwd. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector<int, 4> RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; // If the shuffle patterns aren't repeated but this is a single-input shuffle, // directly generate a cross-lane VPERMD instruction.
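// For example, the full reversal <7, 6, 5, 4, 3, 2, 1, 0> is not lane-repeated, but a single VPERMD with the mask materialized as a constant index vector handles it in one cross-lane shuffle.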
if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, DAG); } /// \brief Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return V; if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types.
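// (AVX2's cross-lane shuffles, VPERMD/VPERMPS and VPERMQ/VPERMPD, only handle 32- and 64-bit elements; 16-bit cross-lane shuffles need AVX512BW's VPERMW.)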
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget); SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG); } /// \brief Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i8 // element types.
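// (PSHUFB only shuffles bytes within each 128-bit lane; byte-granular cross-lane shuffles need AVX512VBMI's VPERMB.)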
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget); if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) { // No floating point type available; if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors.
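// (For example, an element-wise blend of two v16i16 vectors can be done as (V1 & C) | (V2 & ~C) with a constant mask C, using the 256-bit floating point bitwise ops available on AVX1 instead of splitting.)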
if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return V; if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// \brief Try to lower a vector shuffle as 128-bit shuffles. static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is // most probably a better solution there. assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); SmallVector<int, 4> WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } assert(WidenedMask.size() == 4); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 4) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and it's the lowest 128-bits. if (V2Index >= 0 || WidenedMask[i] != 4) { IsInsert = false; break; } V2Index = i; } } if (IsInsert && V2Index >= 0) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, DAG.getIntPtrConstant(0, DL)); return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); } // Try to lower to vshuf64x2/vshuf32x4. SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; unsigned PermMask = 0; // Ensure elements came from the same Op. for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; SDValue Op = WidenedMask[i] >= 4 ?
V2 : V1; unsigned OpIndex = i / 2; if (Ops[OpIndex].isUndef()) Ops[OpIndex] = Op; else if (Ops[OpIndex] != Op) return SDValue(); // Convert the 128-bit shuffle mask selection values into 128-bit selection // bits defined by a vshuf64x2 instruction's immediate control byte. PermMask |= (WidenedMask[i] % 4) << (i * 2); } return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], DAG.getConstant(PermMask, DL, MVT::i8)); } /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } SmallVector<int, 4> RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Shuf128; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Unpck; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG)) return Op; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern.
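// (MOVSLDUP duplicates the even elements - repeated mask <0, 0, 2, 2> - and MOVSHDUP the odd elements - repeated mask <1, 1, 3, 3>.)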
if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes that doesn't cross lanes, use a variable mask with VPERMILPS. if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector<int, 2> Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector<int, 4> PSHUFDMask; scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } SmallVector<int, 4> Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Shuf128; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND.
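// (For example, a v8i64 mask <8, 0, 8, 1, 8, 2, 8, 3> with V2 known zero matches VPEXPANDQ with write-mask 0b10101010: each set bit receives the next sequential element of V1 and the clear bits are zeroed.)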
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector<int, 4> RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (V2.isUndef()) { SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles.
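// (VPERMB picks arbitrary bytes from one 64-byte source; the VPERMV3 form, VPERMT2B/VPERMI2B, picks from the concatenation of two sources.)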
if (Subtarget.hasVBMI()) return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle // the results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// \brief High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } // Lower vXi1 vector shuffles. // There is no dedicated instruction on AVX-512 that shuffles the masks. // The only way to shuffle bits is to sign-extend the mask vector to a SIMD // vector, shuffle, and then truncate it back. static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/o basic ISA!"); MVT ExtVT; switch (VT.SimpleTy) { default: llvm_unreachable("Expected a vector of i1 elements"); case MVT::v2i1: ExtVT = MVT::v2i64; break; case MVT::v4i1: ExtVT = MVT::v4i32; break; case MVT::v8i1: // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit // shuffle. ExtVT = Subtarget.hasVLX() ?
MVT::v8i32 : MVT::v8i64; break; case MVT::v16i1: ExtVT = MVT::v16i32; break; case MVT::v32i1: ExtVT = MVT::v32i16; break; case MVT::v64i1: ExtVT = MVT::v64i8; break; } if (ISD::isBuildVectorAllZeros(V1.getNode())) V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V1.getNode())) V1 = getOnesVector(ExtVT, DAG, DL); else V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); if (V2.isUndef()) V2 = DAG.getUNDEF(ExtVT); else if (ISD::isBuildVectorAllZeros(V2.getNode())) V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); else if (ISD::isBuildVectorAllOnes(V2.getNode())) V2 = getOnesVector(ExtVT, DAG, DL); else V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); // As i1 was sign-extended we can use X86ISD::CVT2MASK. int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements is the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum of // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc.
static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); ArrayRef<int> Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.isUndef(); bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node as the second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector<int, 8> NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; assert(llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } // Commute the shuffle if it will improve canonicalization. if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } /// \brief Try to lower a VSELECT instruction to a vector shuffle.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); auto *CondBV = cast<BuildVectorSDNode>(Cond); // Only non-legal VSELECTs reach this lowering; convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector<int, 32> Mask; for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) && ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector of i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) return Op; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { SDValue Cond = Op.getOperand(0); // The vNi1 condition case should be handled above as it can be trivially // lowered. assert(Cond.getValueType().getScalarSizeInBits() == VT.getScalarSizeInBits() && "Should have a size-matched integer condition!"); // Build a mask by testing the condition against itself (tests for zero). MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2)); } // Only some types will be legal on some subtargets. If we can emit a legal // VSELECT-matching blend, return Op, but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: // FIXME: We should custom lower this by fixing the condition and using i8 // blends.
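// (One plausible approach: widen each constant i16 condition element into two identical bytes so the vXi16 select becomes a 2x-wide i8 select that PBLENDVB can match.)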
return SDValue(); } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!isa(Idx)) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } // Canonicalize result type to MVT::i32. if (EltVT != MVT::i32) { SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec, Idx); return DAG.getAnyExtOrTrunc(Extract, dl, EltVT); } unsigned IdxVal = cast(Idx)->getZExtValue(); // Extracts from element 0 are always allowed. if (IdxVal == 0) return Op; // If the kshift instructions of the correct width aren't natively supported // then we need to promote the vector to the native size to get the correct // zeroing behavior. if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || (VecVT.getVectorNumElements() < 8)) { VecVT = MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, DAG.getUNDEF(VecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Use kshiftr instruction to move to the lower element. 
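  // For example (illustrative), extracting bit 5 of a v16i1 mask held in %k0
  // ends up roughly as:
  //   kshiftrw $5, %k0, %k0    ; move bit 5 down to bit 0
  //   kmovw    %k0, %eax       ; copy the mask to a GPR
  //   andl     $1, %eax        ; isolate the extracted bit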
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); if (VecVT.getVectorElementType() == MVT::i1) return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!isa(Idx)) { // Its more profitable to go through memory (1 cycles throughput) // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput) // IACA tool was used to get performance estimation // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) // // example : extractelement <16 x i8> %a, i32 %i // // Block Throughput: 3.00 Cycles // Throughput Bottleneck: Port5 // // | Num Of | Ports pressure in cycles | | // | Uops | 0 - DV | 5 | 6 | 7 | | // --------------------------------------------- // | 1 | | 1.0 | | | CP | vmovd xmm1, edi // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 // Total Num Of Uops: 4 // // // Block Throughput: 1.00 Cycles // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 // // | | Ports pressure in cycles | | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | // --------------------------------------------------------- // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] // Total Num Of Uops: 4 return SDValue(); } unsigned IdxVal = cast(Idx)->getZExtValue(); // If this is a 256-bit vector result, first extract the 128-bit vector and // then extract the element from the 128-bit vector. if (VecVT.is256BitVector() || VecVT.is512BitVector()) { // Get the 128-bit vector. Vec = extract128BitVector(Vec, IdxVal, DAG, dl); MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 // this can be done with a mask. IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } assert(VecVT.is128BitVector() && "Unexpected vector length"); MVT VT = Op.getSimpleValueType(); if (VT.getSizeInBits() == 16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); // Transform it so it match pextrw which produces a 32-bit result. SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // TODO: We only extract a single element from v16i8, we can probably afford // to be more aggressive here before using the default approach of spilling to // stack. if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { // Extract either the lowest i32 or any i16, and extract the sub-byte. 
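    // Worked example (illustrative): for element 5 of a v16i8, DWordIdx is 1,
    // so the i16 path below is used: WordIdx = 5 / 2 = 2 and
    // ShiftVal = (5 % 2) * 8 = 8, giving
    //   (trunc (srl (extract_elt (bitcast v8i16), 2), 8))
    // i.e. a PEXTRW of word 2 followed by an 8-bit right shift.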
int DWordIdx = IdxVal / 4; if (DWordIdx == 0) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i32)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i16)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } unsigned IdxVal = cast(Idx)->getZExtValue(); unsigned NumElems = VecVT.getVectorNumElements(); // If the kshift instructions of the correct width aren't natively supported // then we need to promote the vector to the native size to get the correct // zeroing behavior. if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) { // Need to promote to v16i1, do the insert, then extract back. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), Vec, DAG.getIntPtrConstant(0, dl)); Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op, DAG.getIntPtrConstant(0, dl)); } SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); if (Vec.isUndef()) { if (IdxVal) EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } // Insertion of one bit into first position if (IdxVal == 0 ) { // Clean top bits of vector. 
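    // Illustrative trace for NumElems = 16: after SCALAR_TO_VECTOR only bit 0
    // of EltInVec is meaningful, so shifting left then right by 15 zeroes the
    // undefined upper bits; shifting Vec right then left by 1 clears its bit
    // 0 while preserving bits 15..1; the OR below merges the two halves.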
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(NumElems - 1, dl, MVT::i8)); EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec, DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Clean the first bit in source vector. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } // Insertion of one bit into last position if (IdxVal == NumElems - 1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } // Move the current value of the bit to be replace to bit 0. SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Xor with the new bit. Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec); // Shift to MSB, filling bottom bits with 0. Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged, DAG.getConstant(NumElems - 1, dl, MVT::i8)); // Shift to the final position, filling upper bits with 0. Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged, DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8)); // Xor with original vector to cancel out the original bit value that's still // present. return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); if (!isa(N2)) return SDValue(); auto *N2C = cast(N2); unsigned IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. 
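      // For example (illustrative), inserting an f32 into element 0 of a
      // v8f32 can be emitted as
      //   vblendps $1, %ymm1, %ymm0, %ymm0  ; %ymm1 = scalar_to_vector(N1)
      // rather than extracting a 128-bit half, inserting into it, and
      // re-inserting it.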
if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); } } // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned Opc; if (VT == MVT::v8i16) { assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); Opc = X86ISD::PINSRB; } if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); } N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } // PINSR* works with constant index. if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // It's always cheaper to replace a xor+movd with xorps and simplifies further // combines. 
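  // For example (illustrative): (v4i32 scalar_to_vector (i32 0)) becomes
  //   xorps %xmm0, %xmm0
  // instead of
  //   xorl %eax, %eax
  //   movd %eax, %xmm0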
if (X86::isZeroNode(Op.getOperand(0))) return getZeroVector(OpVT, Subtarget, DAG, dl); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } assert(OpVT.is128BitVector() && "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); return insert1BitVector(Op, DAG, Subtarget); } static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Only vXi1 extract_subvectors need custom lowering"); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; MVT VecVT = Vec.getSimpleValueType(); unsigned NumElems = VecVT.getVectorNumElements(); // Extend to natively supported kshift. MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. 
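  // For example (illustrative), 32-bit PIC code addresses a constant-pool
  // entry as $g + offset:
  //   leal .LCPI0_0@GOTOFF(%ebx), %eax   ; %ebx holds the PIC base ($g)
  // while RIP-relative code models reference it directly:
  //   movss .LCPI0_0(%rip), %xmm0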
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue X86TargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); Offset = 0; } else { Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. 
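  // Illustrative asm for the 64-bit local-dynamic sequence built here:
  //   leaq  x@tlsld(%rip), %rdi
  //   callq __tls_get_addr@PLT      ; TLS block base for this module in %rax
  //   leaq  x@dtpoff(%rax), %rax    ; add x's offset within the block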
X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. 
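  // For example (illustrative), the 64-bit local-exec form is just:
  //   movq %fs:0, %rax              ; thread pointer
  //   leaq x@tpoff(%rax), %rax      ; x's (negative) offset from it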
return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().Options.EmulatedTLS) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? 
DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. 
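  // Worked example (illustrative): an i64 SHL_PARTS (VTBits = 32) by 40
  // tests (ShAmt & 32) != 0, which holds, so the CMOVs below pick the
  // large-shift values:
  //   Hi = Lo << (40 & 31) = Lo << 8
  //   Lo = 0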
SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, AndNode, DAG.getConstant(0, dl, MVT::i8)); SDValue Hi, Lo; SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } else { Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0); Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1); } SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } if (SrcVT == MVT::v2i1) { // For v2i1, we need to widen to v4i1 first. assert(VT == MVT::v2f64 && "Unexpected type"); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src, DAG.getUNDEF(MVT::v2i1)); return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(), DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src)); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && Subtarget.is64Bit()) { return Op; } SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits()/8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is flagged to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); } return Result; } /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'. 
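  // Why the magic constants work (illustrative): write x = hi * 2^32 + lo.
  // The punpckldq above builds the two doubles
  //   d0 = bits(0x43300000 << 32 | lo) = 2^52 + lo
  //   d1 = bits(0x45300000 << 32 | hi) = 2^84 + hi * 2^32
  // with lo and hi landing exactly in the mantissas, and subtracting
  // c1 = { 2^52, 2^84 } leaves { lo, hi * 2^32 }. The horizontal add below
  // then produces lo + hi * 2^32 = x, with rounding only in that final add.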
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub); SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, DAG.getBitcast(MVT::v2f64, Shuffle), Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); } /// 32-bit unsigned integer to float expansion. static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType()); } static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDLoc &DL) { if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); // Legalize to v4i32 type. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); if (Subtarget.hasAVX512()) return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, // but using v2i32 to v2f64 with X86ISD::CVTSI2P. SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); // Add the two halves. 
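  // Worked example (illustrative): v = 0x12345678 gives HI = 0x1234 and
  // LO = 0x5678, so fHI = 4660.0 * 65536.0 = 305397760.0 and fLO = 22136.0;
  // their sum is 305419896.0 == (double)0x12345678, every step exact in f64.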
return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; // We shouldn't use it when unsafe-fp-math is enabled though: we might later // reassociate the two FADDs, and if we do that, the algorithm fails // spectacularly (PR24512). // FIXME: If we ever have some kind of Machine FMF, this should be marked // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because // there's also the MachineCombiner reassociations happening on Machine IR. if (DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDLoc DL(Op); SDValue V = Op->getOperand(0); MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). 
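  // Illustrative decoding of the constant below: 0xD3000080 is the f32 with
  // sign 1, exponent 2^39 and mantissa 0x80, i.e.
  //   -(1 + 0x80 * 2^-23) * 2^39 = -(2^39 + 2^23)
  // which matches the -(0x1.0p39f + 0x1.0p23f) named above.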
SDValue VecCstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = Op.getOperand(0); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); if (SrcVT == MVT::v2i1) { // For v2i1, we need to widen to v4i1 first. assert(Op.getValueType() == MVT::v2f64 && "Unexpected type"); N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0, DAG.getUNDEF(MVT::v2i1)); return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64, DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0)); } switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) 
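  // Illustrative decoding: 0x5F800000 below is the f32 encoding of 2^64.
  // FILD reads the i64 as signed, so when the input's sign bit is set the
  // loaded value is x - 2^64; adding the 2^64 fudge (selected by SignSet)
  // restores the unsigned value, and adding 0.0 leaves non-negative inputs
  // unchanged.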
int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an pair. // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence. // If lowered to the final integer result we return a pair. // Otherwise we lower it to a sequence ending with a FIST, return a // pair, and the caller is responsible for loading // the final integer result from StackSlot. std::pair X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return std::make_pair(SDValue(), SDValue()); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64 && (!Subtarget.is64Bit() || !isScalarFPTypeInSSEReg(TheVT)); if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // These are really Legal. 
if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget.is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); SDValue Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Adjust = DAG.getSelect(DL, MVT::i32, Cmp, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0x80000000, DL, MVT::i32)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. 
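  // Worked example (illustrative) of the unsigned fixup above: Thresh is
  // 2^63 (f32 0x5F000000). For an input of 2^63 + 5 the FIST consumes
  // (2^63 + 5) - 2^63 = 5, and XOR'ing the high dword of the result with
  // Adjust = 0x80000000 restores 2^63 + 5; inputs below 2^63 keep
  // Adjust = 0 and pass through unchanged.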
if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI)); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, MemSize, MemSize); if (UnsignedFixup) { // Insert the FIST, load its result as two i32's, // and XOR the high i32 with Adjust. SDValue FistOps[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), FistOps, DstTy, MMO); SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); if (Subtarget.is64Bit()) { // Join High32 and Low32 into a 64-bit result. // (High32 << 32) | Low32 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, DAG.getConstant(32, DL, MVT::i8)); SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); return std::make_pair(Result, SDValue()); } SDValue ResultOps[] = { Low32, High32 }; SDValue pair = IsReplace ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) : DAG.getMergeValues(ResultOps, DL); return std::make_pair(pair, SDValue()); } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8) && (VT != MVT::v8i64 || InVT != MVT::v8i32) && (VT != MVT::v8i64 || InVT != MVT::v8i16) && (VT != MVT::v16i32 || InVT != MVT::v16i16) && (VT != MVT::v16i32 || InVT != MVT::v16i8) && (VT != MVT::v32i16 || InVT != MVT::v32i8)) return SDValue(); if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VZEXT, dl, VT, In); // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? 
ZeroVec : Undef); MVT HVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getBitcast(HVT, OpLo); OpHi = DAG.getBitcast(HVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is v8/v16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && (VT.getVectorElementType().getSizeInBits() <= 16)) ExtVT = MVT::getVectorVT(MVT::i32, NumElts); // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, DL)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, DAG.getIntPtrConstant(0, DL)); return SelectedVal; } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() || Op.getSimpleValueType().getVectorNumElements() != SVT.getVectorNumElements()); return SDValue(); } /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. /// It makes use of the fact that vectors with enough leading sign/zero bits /// prevent the PACKSS/PACKUS from saturating the results. /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && "Unexpected PACK opcode"); // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); EVT SrcVT = In.getValueType(); // No truncation required, we might get here due to recursive calls. if (SrcVT == DstVT) return In; // We only support vector truncation to 128bits or greater from a // 256bits or greater source. 
unsigned DstSizeInBits = DstVT.getSizeInBits(); unsigned SrcSizeInBits = SrcVT.getSizeInBits(); if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0) return SDValue(); LLVMContext &Ctx = *DAG.getContext(); unsigned NumElems = SrcVT.getVectorNumElements(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; if (DstVT.getScalarSizeInBits() > 8 && (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). Res = DAG.getBitcast(MVT::v4i64, Res); Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater"); EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. 
// Shift packed bytes not supported natively, bitcast to word MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); In = DAG.getBitcast(InVT, In); } return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. In = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); } return DAG.getNode(X86ISD::TESTM, DL, VT, In, In); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { // word to byte only under BWI if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8 return DAG.getNode(X86ISD::VTRUNC, DL, VT, getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG)); return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. unsigned NumPackedBits = std::min(VT.getScalarSizeInBits(), 16); if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; // Truncate with PACKUS if we are truncating a vector with leading zero bits // that extend all the way to the packed/truncated value. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known; DAG.computeKnownBits(In, Known); NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros()) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); // The PSHUFB mask: static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } // Handle truncation of V256 to V128 using shuffles. if (!VT.is128BitVector() || !InVT.is256BitVector()) return SDValue(); assert(Subtarget.hasFp256() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; In = DAG.getBitcast(NVT, In); SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. ResVT = MVT::v8i32; TruncVT = MVT::v8i1; Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } return SDValue(); } assert(!VT.isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. 
return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); // The node is the result. return FIST; } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. MVT LogicVT; MVT EltVT; if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); } else if (IsF128) { // SSE instructions are used for optimized f128 logical operations. LogicVT = MVT::f128; EltVT = VT; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; EltVT = VT; } unsigned EltBits = EltVT.getSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = EltVT == MVT::f64 ? APFloat::IEEEdouble() : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle()); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. 
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
                       DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are
  // no scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or
                       : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                     DAG.getIntPtrConstant(0, dl));
}

static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
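// The FABS/FNEG/FCOPYSIGN lowerings above are all the same bit trick on the
// IEEE sign bit, performed with FP logic ops so the mask constants can be
// load-folded. A scalar sketch of the copysign case (illustrative only; the
// helper name is hypothetical):
//
//   const uint64_t SIGN = 0x8000000000000000ULL;  // sign-bit mask for f64
//   uint64_t copysign_bits(uint64_t mag, uint64_t sgn) {
//     return (mag & ~SIGN) | (sgn & SIGN);        // two FANDs + one FOR
//   }
//
// In the same encoding FABS is (mag & ~SIGN), FNEG is (mag ^ SIGN), and the
// folded FNABS is (mag | SIGN).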
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
       I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is
    // only 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(),
                     VecIns.back());
}

/// \brief return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } // Emit KTEST instruction for bit vectors on AVX-512 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Op.getOpcode() == ISD::BITCAST) { auto hasKTEST = [&](MVT VT) { unsigned SizeInBits = VT.getSizeInBits(); return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) || (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64)); }; SDValue Op0 = Op.getOperand(0); MVT Op0VT = Op0.getValueType().getSimpleVT(); if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 && hasKTEST(Op0VT)) return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0); } return SDValue(); } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::i1) { SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp, DAG.getConstant(0, dl, MVT::i8)); } // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; LLVM_FALLTHROUGH; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit KTEST for bit vectors if (auto Node = EmitKTEST(Op, DAG, Subtarget)) return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; // Truncate operations may prevent the merge of the SETCC instruction // and the arithmetic instruction before it. Attempt to truncate the operands // of the arithmetic instruction and use a reduced bit-width instruction. bool NeedTruncation = false; SDValue ArithOp = Op; if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { SDValue Arith = Op->getOperand(0); // Both the trunc and the arithmetic op need to have one user each. if (Arith->hasOneUse()) switch (Arith.getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::AND: case ISD::OR: case ISD::XOR: { NeedTruncation = true; ArithOp = Arith; } } } // Sometimes flags can be set either with an AND or with an SRL/SHL // instruction. SRL/SHL variant should be preferred for masks longer than this // number of bits. const int ShiftToAndMaxMaskWidth = 32; const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE); // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. 
We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::ADD: // We only want to rewrite this as a target-specific node with attached // flags if there is a reasonable chance of either using that to do custom // instructions selection that can fold some of the memory operands, or if // only the flags are used. If there are other uses, leave the node alone // and emit a test instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; if (auto *C = dyn_cast(ArithOp.getOperand(1))) { // An add of one will be selected as an INC. if (C->isOne() && (!Subtarget.slowIncDec() || DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. if (C->isAllOnesValue() && (!Subtarget.slowIncDec() || DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::DEC; NumOperands = 1; break; } } // Otherwise use a regular EFLAGS-setting add. Opcode = X86ISD::ADD; NumOperands = 2; break; case ISD::SHL: case ISD::SRL: // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if (ZeroCheck && Op->hasOneUse() && isa(Op->getOperand(1)) && !hasNonFlagsUse(Op)) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); unsigned ShAmt = Op->getConstantOperandVal(1); if (ShAmt >= BitWidth) // Avoid undefined shifts. break; APInt Mask = ArithOp.getOpcode() == ISD::SRL ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0), DAG.getConstant(Mask, dl, VT)); } break; case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. However, AND should be // preferred if the instruction can be combined into ANDN. if (!hasNonFlagsUse(Op)) { SDValue Op0 = ArithOp->getOperand(0); SDValue Op1 = ArithOp->getOperand(1); EVT VT = ArithOp.getValueType(); bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1); bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64; bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI(); // If we cannot select an ANDN instruction, check if we can replace // AND+IMM64 with a shift before giving up. This is possible for masks // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag. if (!isProperAndn) { if (!ZeroCheck) break; assert(!isa(Op0) && "AND node isn't canonicalized"); auto *CN = dyn_cast(Op1); if (!CN) break; const APInt &Mask = CN->getAPIntValue(); if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth)) break; // Prefer TEST instruction. 
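      // The two rewrites below rely on masks that are a solid block of ones
      // at one end of the value: a shift then moves exactly the tested bits
      // to (or away from) bit 0, so the zero flag after the shift equals the
      // zero flag after the AND. Illustrative, with hypothetical 64-bit
      // masks:
      //
      //   (x & 0xFFFFFF0000000000) == 0  <=>  (x >> 40) == 0  // ones, zeros
      //   (x & 0x0000000000FFFFFF) == 0  <=>  (x << 40) == 0  // zeros, ones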
unsigned BitWidth = Mask.getBitWidth(); unsigned LeadingOnes = Mask.countLeadingOnes(); unsigned TrailingZeros = Mask.countTrailingZeros(); if (LeadingOnes + TrailingZeros == BitWidth) { assert(TrailingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy); Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt); break; } unsigned LeadingZeros = Mask.countLeadingZeros(); unsigned TrailingOnes = Mask.countTrailingOnes(); if (LeadingZeros + TrailingOnes == BitWidth) { assert(LeadingZeros < VT.getSizeInBits() && "Shift amount should be less than the type width"); MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT); SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy); Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt); break; } break; } } LLVM_FALLTHROUGH; case ISD::SUB: case ISD::OR: case ISD::XOR: // Similar to ISD::ADD above, check if the uses will preclude useful // lowering of the target-specific node. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: { if (!NeedTruncation && ZeroCheck) { if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG)) return EFLAGS; } Opcode = X86ISD::OR; break; } } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::INC: case X86ISD::DEC: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } // If we found that truncation is beneficial, perform the truncation and // update 'Op'. if (NeedTruncation) { EVT VT = Op.getValueType(); SDValue WideVal = Op->getOperand(0); EVT WideVT = WideVal.getValueType(); unsigned ConvertedOp = 0; // Use a target machine opcode to prevent further DAGCombine // optimizations that may separate the arithmetic operations // from the setcc node. switch (WideVal.getOpcode()) { default: break; case ISD::ADD: ConvertedOp = X86ISD::ADD; break; case ISD::SUB: ConvertedOp = X86ISD::SUB; break; case ISD::AND: ConvertedOp = X86ISD::AND; break; case ISD::OR: ConvertedOp = X86ISD::OR; break; case ISD::XOR: ConvertedOp = X86ISD::XOR; break; } if (ConvertedOp) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); } } } if (Opcode == 0) { // Emit KTEST for bit vectors if (auto Node = EmitKTEST(Op, DAG, Subtarget)) return Node; // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesWith(Op, New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. 
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG); assert(!(isa(Op1) && Op0.getValueType() == MVT::i1) && "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided. if ((Op0.getValueType() == MVT::i16 && (isa(Op0) || isa(Op1))) && !DAG.getMachineFunction().getFunction().optForMinSize() && !Subtarget.isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget.hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). 
If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // TODO: Add support for AVX512 (v16f32). // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition /// according to equal/not-equal condition code \p CC. static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { // If Src is i8, promote it to i32 with any_extend. There is no i8 BT // instruction. Since the shift amount is in-range-or-undefined, we know // that doing a bittest on the i32 value is ok. We extend to i32 because // the encoding for the i16 version is larger than the i32 version. // Also promote i16 to i32 for performance / code size reason. if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16) Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src); // See if we can use the 32-bit instruction instead of the 64-bit one for a // shorter encoding. 
  // Since the former takes the modulo 32 of BitNo and the latter takes the
  // modulo 64, this is only valid if bit 5 of BitNo is known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match.  Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}

/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;
    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}

/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ: SSECC = 8; break;
  case ISD::SETONE: SSECC = 12; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
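// SSE's packed FP compare predicate field only encodes eight predicates, so
// SETUEQ and SETONE (codes 8 and 12 above) have no single-instruction form
// before AVX; LowerVSETCC below splits each into two compares plus a logic
// op. Illustrative identities (not tied to any one instruction):
//
//   ueq(a, b) = unord(a, b) | eq(a, b)   // CC0 = 3, CC1 = 0, joined by FOR
//   one(a, b) = ord(a, b)  & neq(a, b)   // CC0 = 7, CC1 = 4, joined by FAND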
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}

static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}
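// The i1 identities above treat a set mask bit as the value 1 and lower the
// compares to plain bitwise ops on mask registers. A quick scalar check of
// one case, with x and y in {0, 1} (illustrative only):
//
//   bool gt(bool x, bool y) { return x && !y; }  // (x > y) -> (x & ~y)
//   // gt(1, 0) is the only true case, matching > on one-bit values.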
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break;     // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to
  // TESTM|NM.
  if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
    SDValue A = peekThroughBitcasts(Op0);
    if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
        ISD::isBuildVectorAllZeros(Op1.getNode())) {

      MVT VT0 = Op0.getSimpleValueType();
      SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
      SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));

      return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM
                                                : X86ISD::TESTM,
                         dl, VT, RHS, LHS);
    }
  }

  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1.  If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;

  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This
      // allows them to work with an SSE1 target (integer vector types are
      // not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
    if (SSECC >= 8 && !Subtarget.hasAVX()) {
      // LLVM predicate is SETUEQ or SETONE.
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (Cond == ISD::SETUEQ) {
        CC0 = 3; // UNORD
        CC1 = 0; // EQ
        CombineOpc = X86ISD::FOR;
      } else {
        assert(Cond == ISD::SETONE);
        CC0 = 7; // ORD
        CC1 = 4; // NEQ
        CombineOpc = X86ISD::FAND;
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    } else {
      // Handle all other FP comparisons here.
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, dl, MVT::i8));
    }

    // If this is SSE/AVX CMPP, bitcast the result back to integer to match
    // the result type of SETCC. The bitcast is expected to be optimized away
    // during combining/isel.
if (Opc == X86ISD::CMPP) Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); return Cmp; } MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); if (VT.is128BitVector() && VTOp0.is256BitVector()) { // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the // legalizer firstly checks if the first operand in input to the setcc has // a legal type. If so, then it promotes the return type to that same type. // Otherwise, the return type is promoted to the 'next legal type' which, // for a vector of MVT::i1 is always a 128-bit integer vector type. // // We reach this code only if the following two conditions are met: // 1. Both return type and operand type have been promoted to wider types // by the type legalizer. // 2. The original operand type has been promoted to a 256-bit vector. // // Note that condition 2. only applies for AVX targets. SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond); return DAG.getZExtOrTrunc(NewOp, dl, VT); } // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); // Operands are boolean (vectors of i1) MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. // In this case use SSE compare bool UseAVX512Inst = (OpVT.is512BitVector() || OpVT.getScalarSizeInBits() >= 32 || (Subtarget.hasBWI() && Subtarget.hasVLX())); if (UseAVX512Inst) return LowerIntVSETCC_AVX512(Op, DAG); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } // Lower using XOP integer comparisons. if ((VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. 
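  // (For a single-bit Y, the AND result can only be 0 or Y itself, so "any
  // bit set" and "equal to Y" coincide: e.g. with Y = 0b0100 per element,
  // X & Y is 0b0100 whenever bit 2 of X is set and 0 otherwise. Rewriting
  // != 0 as == Y saves the vector NOT that SETNE would otherwise need.)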
if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == ISD::AND) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, false, false)) { if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { Cond = ISD::SETEQ; Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } } } // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || Cond == ISD::SETUGE; bool Invert = Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); // If both operands are known non-negative, then an unsigned compare is the // same as a signed compare and there's no need to flip signbits. // TODO: We could check for more general simplifications here since we're // computing known bits. bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool HasMinMax = (Subtarget.hasAVX512() && VET == MVT::i64) || (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) || (Subtarget.hasSSE2() && (VET == MVT::i8)); bool MinMax = false; if (HasMinMax) { switch (Cond) { default: break; case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break; case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break; } if (MinMax) Swap = Invert = FlipSigns = false; } bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16); bool Subus = false; if (!MinMax && HasSubus) { // As another special case, use PSUBUS[BW] when it's profitable. E.g. for // Op0 u<= Op1: // t = psubus Op0, Op1 // pcmpeq t, <0..0> switch (Cond) { default: break; case ISD::SETULT: { // If the comparison is against a constant we can turn this into a // setule. With psubus, setule does not require a swap. This is // beneficial because the constant in the register is no longer // destructed as the destination so it can be hoisted out of a loop. // Only do this pre-AVX since vpcmp* is no longer destructive. if (Subtarget.hasAVX()) break; if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) { Op1 = ULEOp1; Subus = true; Invert = false; Swap = false; } break; } // Psubus is better than flip-sign because it requires no inversion. case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break; case ISD::SETULE: Subus = true; Invert = false; Swap = false; break; } if (Subus) { Opc = X86ISD::SUBUS; FlipSigns = false; } } if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. 
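    // Illustrative note (editorial): XORing the sign bit converts between
    // signed and unsigned order for 32-bit lanes:
    //   a <u b  <=>  (a ^ 0x80000000) <s (b ^ 0x80000000)
    // For an unsigned 64-bit compare (FlipSigns) all four dwords are flipped;
    // for a signed one only the low dwords (lanes 0 and 2) are flipped, since
    // the high halves must be compared signed while the low halves are always
    // compared unsigned.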
SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32); } else { SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32); SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32); SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero}); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); if (MinMax) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); if (Subus) Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, getZeroVector(VT, Subtarget, DAG, dl)); return Result; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG)) return NewSetCC; } // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. 
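  // Illustrative note (editorial): if X is itself an X86ISD::SETCC, its value
  // is already 0 or 1, so "X == 1" / "X != 0" collapse to X, and
  // "X == 0" / "X != 1" collapse to X with the opposite condition code; no
  // CMP instruction needs to be emitted.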
if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; CCode = X86::GetOppositeBranchCondition(CCode); return getSETCC(CCode, Op0.getOperand(1), dl, DAG); } } bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (X86CC == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); return getSETCC(X86CC, EFLAGS, dl, DAG); } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; if (Op.getResNo() == 2 && Opc == X86ISD::UMUL) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 
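  // Illustrative note (editorial): the branch-free sequence materializes an
  // all-ones or all-zeros mask from the compare and then selects bitwise:
  //   mask = cmp(a, b, cc)             // 0xFFFF... if true, 0 if false
  //   res  = (mask & t) | (~mask & f)  // the FAND / FANDN / FOR nodes below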
if (Cond.getOpcode() == ISD::SETCC && ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); unsigned SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isa(Op1) && !isa(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } // For v64i1 without 64-bit support we need to split and rejoin. 
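  // Editorial note: on a 32-bit target a 64-bit mask presumably cannot
  // round-trip through a GPR, so the select is performed on two v32i1 halves
  // that are concatenated back together afterwards.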
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { assert(Subtarget.hasBWI() && "Expected BWI to be legal"); SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (VT == MVT::v4i1 || VT == MVT::v2i1) { SDValue zeroConst = DAG.getIntPtrConstant(0, DL); Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op1, zeroConst); Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op2, zeroConst); SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst); } if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode = cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? 
Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0); SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); return Res; } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDValue Res = // Res = 0 or -1. DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Neg); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? 
IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); AddTest = false; } } } if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitTest(Cond, X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. 
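  // Editorial note: accordingly, the false value (Op2) is placed first and
  // the true value (Op1) second, so the node computes select(Cond, Op1, Op2).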
SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is v8/v16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) ExtVT = MVT::getVectorVT(MVT::i32, NumElts); // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG); } else { SDValue NegOne = getOnesVector(WideVT, DAG, dl); SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; return SDValue(); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasInt256()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (VT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. 
int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements(); In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); } assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG || InVT == MVT::v64i8) && "Zero extend only for v64i8 input!"); // SSE41 targets can use the pmovsx* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? X86ISD::VSEXT : X86ISD::VZEXT; return DAG.getNode(ExtOpc, dl, VT, In); } // We should only get here for sign extend. assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. SDValue Curr = In; MVT CurrVT = InVT; // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); Curr = DAG.getBitcast(CurrVT, Curr); } SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } if (CurrVT == VT) return SignExt; if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) { SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(31, dl, MVT::i8)); SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5}); return DAG.getBitcast(VT, Ext); } return SDValue(); } static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && (VT != MVT::v8i32 || InVT != MVT::v8i16) && (VT != MVT::v16i16 || InVT != MVT::v16i8) && (VT != MVT::v8i64 || InVT != MVT::v8i32) && (VT != MVT::v8i64 || InVT != MVT::v8i16) && (VT != MVT::v16i32 || InVT != MVT::v16i16) && (VT != MVT::v16i32 || InVT != MVT::v16i8) && (VT != MVT::v32i16 || InVT != MVT::v32i8)) return SDValue(); if (Subtarget.hasInt256()) return DAG.getNode(X86ISD::VSEXT, dl, VT, In); // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT unsigned NumElems = InVT.getVectorNumElements(); SDValue Undef = DAG.getUNDEF(InVT); SmallVector ShufMask1(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i; SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1); SmallVector ShufMask2(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask2[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2); MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT); OpHi = 
DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Lower truncating store. We need a special lowering to vXi1 vectors static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(StOp.getNode()); SDLoc dl(St); EVT MemVT = St->getMemoryVT(); assert(St->isTruncatingStore() && "We only custom truncating store."); assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 && "Expected truncstore of i1 vector"); SDValue Op = St->getValue(); MVT OpVT = Op.getValueType().getSimpleVT(); unsigned NumElts = OpVT.getVectorNumElements(); if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) || NumElts == 16) { // Truncate and store - everything is legal Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op); if (MemVT.getSizeInBits() < 8) Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getUNDEF(MVT::v8i1), Op, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), St->getMemOperand()); } // A subset, assume that we have only AVX-512F if (NumElts <= 8) { if (NumElts < 8) { // Extend to 8-elts vector MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8); Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT, DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl)); } Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op); Op = DAG.getBitcast(MVT::i8, Op); return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(), St->getMemOperand()); } // v32i8 assert(OpVT == MVT::v32i8 && "Unexpected operand type"); // Divide the vector into 2 parts and store each part separately SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, DAG.getIntPtrConstant(0, dl)); Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo); SDValue BasePtr = St->getBasePtr(); SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr, St->getMemOperand()); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op, DAG.getIntPtrConstant(16, dl)); Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi); SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl); SDValue StHi = DAG.getStore(St->getChain(), dl, Hi, BasePtrHi, St->getPointerInfo().getWithOffset(2), MinAlign(St->getAlignment(), 2U), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi); } static SDValue LowerExtended1BitVectorLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 && "Expected i1 vector load"); unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; MVT VT = Op.getValueType().getSimpleVT(); unsigned NumElts = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && NumElts >= 32) || (Subtarget.hasDQI() && NumElts < 16) || NumElts == 16) { // Load and extend - everything is legal if (NumElts < 8) { SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); if (Subtarget.hasVLX()) { // Extract to v4i1/v2i1. SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load, DAG.getIntPtrConstant(0, dl)); // Finally, do a normal sign-extend to the desired register. 
return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract); } MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load); } if (NumElts <= 8) { // A subset, assume that we have only AVX-512F SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load); if (NumElts == 8) return DAG.getNode(ExtOpcode, dl, VT, BitVec); if (Subtarget.hasVLX()) { // Extract to v4i1/v2i1. SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec, DAG.getIntPtrConstant(0, dl)); // Finally, do a normal sign-extend to the desired register. return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract); } MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8); SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } assert(VT == MVT::v32i8 && "Unexpected extload type"); SDValue BasePtr = Ld->getBasePtr(); SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl); SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi, Ld->getPointerInfo().getWithOffset(2), MinAlign(Ld->getAlignment(), 2U), Ld->getMemOperand()->getFlags()); SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadLo.getValue(1), LoadHi.getValue(1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. // TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector sext loads."); assert(RegVT.isInteger() && "We only custom lower integer vector sext loads."); // Nothing useful we can do without SSE2 shuffles. 
assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); if (MemVT.getScalarType() == MVT::i1) return LowerExtended1BitVectorLoad(Op, Subtarget, DAG); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); // Finally, do a normal sign-extend to the desired register. return DAG.getSExtOrTrunc(Load, dl, RegVT); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. 
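  // Worked example (editorial): for MemVT = v4i8 (MemSz = 32) the loop above
  // settles on SclrLoadTy = i32, so NumLoads = 32 / 32 = 1 and the whole
  // memory value is fetched with a single scalar load, then shuffled into
  // place below.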
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegZize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) loadRegZize = 128; // If we don't have BWI we won't be able to create the shuffle needed for // v8i8->v8i64. if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) loadRegZize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); for (unsigned i = 0; i < NumLoads; ++i) { // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { // If we have SSE4.1, we can directly emit a VSEXT node. if (Subtarget.hasSSE41()) { SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest // lanes. assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Sext; } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // Bitcast to the requested type. 
Shuff = DAG.getBitcast(RegVT, Shuff); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the /// SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) && Cond.getOperand(0).getValueType() != MVT::i8)) { SDValue LHS = Cond.getOperand(0); SDValue RHS = Cond.getOperand(1); unsigned X86Opcode; unsigned X86Cond; SDVTList VTs; // Keep this in sync with LowerXALUO, otherwise we might create redundant // instructions that can't be removed afterwards (i.e. X86ISD::ADD and // X86ISD::INC). 
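  // Editorial note: as in LowerXALUO, an add/sub of constant 1 is mapped to
  // X86ISD::INC/DEC below, so the two lowerings produce the same node and it
  // can be CSE'd instead of leaving both an ADD and an INC behind.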
switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break; case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break; case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break; default: llvm_unreachable("unexpected overflowing operator"); } if (Inverted) X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond); if (CondOpcode == ISD::UMULO) VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); else VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS); if (CondOpcode == ISD::UMULO) Cond = X86Op.getValue(2); else Cond = X86Op.getValue(1); CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. 
X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_UNE. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8); Cond = Cmp; addTest = false; Dest = FalseBB; } } } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitTest(Cond, X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. 
// Touching the stack at 4K increments is necessary to ensure that the guard
// pages used by the OS virtual memory manager are allocated in correct
// sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack || EmitStackProbe;
  SDLoc dl(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
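      // Editorial note: the 'nest' parameter's static-chain pointer is passed
      // in r10 on x86-64, which is why it cannot coexist with segmented
      // stacks; hence the fatal error below.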
const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. 
return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Bitcast the source vector to the output type, this is mainly necessary for // vXi8/vXi64 shifts. if (VT != SrcOp.getSimpleValueType()) SrcOp = DAG.getBitcast(VT, SrcOp); // Fold this packed shift into its first operand if ShiftAmt is 0. 
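  // Out-of-range amounts are clamped just below; e.g. for v4i32, VSHLI/VSRLI
  // by 32 or more yields the zero vector, while VSRAI is capped at 31 so
  // each lane collapses to its sign fill (0 or -1), matching the hardware.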
if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; switch(Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } return DAG.getBuildVector(VT, dl, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version switch (Opc) { default: llvm_unreachable("Unknown target vector shift node"); case X86ISD::VSHLI: Opc = X86ISD::VSHL; break; case X86ISD::VSRLI: Opc = X86ISD::VSRL; break; case X86ISD::VSRAI: Opc = X86ISD::VSRA; break; } // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. // +=================+============+=======================================+ // | ShAmt is | HasSSE4.1? 
| Construct ShAmt vector as | // +=================+============+=======================================+ // | i64 | Yes, No | Use ShAmt as lowest elt | // | i32 | Yes | zero-extend in-reg | // | (i32 zext(i16)) | Yes | zero-extend in-reg | // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | // +=================+============+=======================================+ if (SVT == MVT::i64) ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) { ShAmt = ShAmt.getOperand(0); ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt); ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// \brief Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { if (isAllOnesConstant(Mask)) return DAG.getConstant(1, dl, MaskVT); if (X86::isZeroNode(Mask)) return DAG.getConstant(0, dl, MaskVT); if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended Mask = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); } if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { if (MaskVT == MVT::v64i1) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(0, dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(1, dl, MVT::i32)); Lo = DAG.getBitcast(MVT::v32i1, Lo); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { // MaskVT require < 64bit. Truncate mask (should succeed in any case), // and bitcast. MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); return DAG.getBitcast(MaskVT, DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); } } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. 
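    // Example of this path: an i8 mask arriving for MaskVT == v2i1 is
    // bitcast to v8i1, and the EXTRACT_SUBVECTOR below then keeps the low
    // two lanes, so bit 0 of the scalar mask governs lane 0 and so on.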
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } } /// \brief Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); if (isAllOnesConstant(Mask)) return Op; SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); switch (Op.getOpcode()) { default: break; case X86ISD::CMPM: case X86ISD::CMPM_RND: case X86ISD::CMPMU: case X86ISD::VPSHUFBITQMB: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); case X86ISD::VFPCLASS: return DAG.getNode(ISD::OR, dl, VT, Op, VMask); case X86ISD::VTRUNC: case X86ISD::VTRUNCS: case X86ISD::VTRUNCUS: case X86ISD::CVTPS2PH: // We can't use ISD::VSELECT here because it is not always "Legal" // for the destination type. For example vpmovqb require only AVX512 // and vselect that can operate on byte element type require BWI OpcodeSelect = X86ISD::SELECT; break; } if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). /// The mask is coming as MVT::i8 and it should be transformed /// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (auto *MaskConst = dyn_cast(Mask)) if (MaskConst->getZExtValue() & 0x1) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_RND) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::OR, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. 
/// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // It's possible that the parent function no longer has a personality function // if the exceptional code was optimized away, in which case we just return // the incoming EBP. if (!Fn->hasPersonalityFn()) return EntryEBP; // Get an MCSymbol that will ultimately resolve to the frame offset of the EH // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (!isa(Rnd)) return false; unsigned Round = cast(Rnd)->getZExtValue(); return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; }; SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; // We always add rounding mode to the Node. // If the rounding mode is not specified, we add the // "current direction" mode. 
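      // The rounding immediates used here are the AVX-512 embedded-rounding
      // encodings from X86::STATIC_ROUNDING: TO_NEAREST_INT = 0,
      // TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3, and CUR_DIRECTION = 4,
      // the last one meaning "use whatever MXCSR.RC currently selects".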
if (Op.getNumOperands() == 4) RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. bool HasRounding = IntrWithRoundingModeOpcode != 0; if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Rnd), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); if (HasRounding) { SDValue Sae = Op.getOperand(6); if (!isRoundModeCurDirection(Sae)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Sae), Mask, Src0, Subtarget, DAG); } assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
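      // The same Opc0/Opc1 dispatch recurs throughout this switch; as a
      // rough pseudocode sketch (not actual DAG API):
      //
      //   if (Opc1 != 0 && Rnd != CUR_DIRECTION)
      //     N = Opc1(..., Rnd);   // *_RND variant, explicit rounding
      //   else
      //     N = Opc0(...);        // plain variant, rounding from MXCSR
      //   return select(Mask, N, PassThru);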
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } } // TODO: Intrinsics should have fast-math-flags to propagate. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Imm = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Imm, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_IMM8_MASK: case INTR_TYPE_3OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case VPERM_2OP_MASK : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1), Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: case VPERM_3OP_MASK:{ MVT VT = Op.getSimpleValueType(); // Src2 is the PassThru SDValue Src1 = Op.getOperand(1); // PassThru needs to be the same type as the destination in order // to pattern match correctly. SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2)); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == VPERM_3OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else PassThru = Src2; // Swap Src1 and Src2 in the node creation return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src2, Src1, Src3), Mask, PassThru, Subtarget, DAG); } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; else PassThru = Src1; // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
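      // Element-wise, the masked FMA forms reduce to the sketch
      //   r[i] = mask[i] ? fma(src1[i], src2[i], src3[i]) : passthru[i]
      // with passthru being src1 for _MASK, src3 for _MASK3, and zero for
      // _MASKZ, exactly as selected above.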
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case FMA_OP_SCALAR_MASK: case FMA_OP_SCALAR_MASK3: case FMA_OP_SCALAR_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element if (IntrData->Type == FMA_OP_SCALAR_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_SCALAR_MASK3) PassThru = Src3; else PassThru = Src1; unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case IFMA_OP_MASKZ: case IFMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src1; // set PassThru element if (IntrData->Type == IFMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); // Node we need to swizzle the operands to pass the multiply operands // first. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src2, Src3, Src1), Mask, PassThru, Subtarget, DAG); } case TERLOG_OP_MASK: case TERLOG_OP_MASKZ: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); SDValue Mask = Op.getOperand(5); MVT VT = Op.getSimpleValueType(); SDValue PassThru = Src1; // Set PassThru element. if (IntrData->Type == TERLOG_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Src4), Mask, PassThru, Subtarget, DAG); } case CVTPD2PS: // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), DAG.getIntPtrConstant(0, dl)); case CVTPD2PS_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. 
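      // (A trunc flag of 1 would assert that the conversion is known to be
      // exact, permitting the FP_ROUND to be folded away; 0 is the only
      // safe choice for a genuine f64->f32 narrowing.)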
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, DAG.getIntPtrConstant(0, dl)), Mask, PassThru, Subtarget, DAG); } case FPCLASS: { // FPclass intrinsics with mask SDValue Src1 = Op.getOperand(1); MVT VT = Src1.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask, DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. // Example of transformation: // (i8 (int_x86_avx512_mask_pcmpeq_q_128 // (v2i64 %a), (v2i64 %b), (i8 %mask))) -> // (i8 (bitcast // (v8i1 (insert_subvector undef, // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) MVT VT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); } else { assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!"); Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); } SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask, DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); break; } case ISD::SETGE: // CF = 0 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; case ISD::SETLE: // The condition is opposite to GE. Swap the operands. 
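      // For reference, (U)COMISS/(U)COMISD set EFLAGS as follows, which is
      // why the unsigned-style conditions are used here:
      //   unordered: ZF=1 PF=1 CF=1
      //   greater:   ZF=0 PF=0 CF=0  -> COND_A
      //   less:      ZF=0 PF=0 CF=1
      //   equal:     ZF=1 PF=0 CF=0  -> COND_E plus COND_NP to exclude NaNs
      // so COND_AE on the swapped comparison below tests CF=0, i.e. GE.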
SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned CondVal = cast(Op.getOperand(3))->getZExtValue(); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); else FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp, DAG.getIntPtrConstant(0, dl)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } case BROADCASTM: { SDValue Mask = Op.getOperand(1); MVT MaskVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2); return DAG.getBitcast(VT, Res); } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: case FIXUPIMM_MASKZ:{ SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? Src1 : getZeroVector(VT, Subtarget, DAG, dl); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); } case CONVERT_TO_MASK: { MVT SrcVT = Op.getOperand(1).getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1)); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CvtMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(2), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } case ROUNDS: { assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(3), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::x86_avx2_permd: case Intrinsic::x86_avx2_permps: // Operands intentionally swapped. Mask is last operand to intrinsic, // but second operand for node/instruction. return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(1)); // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. 
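  // PTEST/VTESTP* set ZF = ((src1 & src2) == 0) and CF = ((~src1 & src2) == 0),
  // so the *z variants map to COND_E (ZF=1), the *c variants to COND_B
  // (CF=1), and the *nzc variants to COND_A (ZF=0 and CF=0), as below.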
case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { bool IsTestPacked = false; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: IsTestPacked = true; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_kortestz_w: case Intrinsic::x86_avx512_kortestc_w: { X86::CondCode X86CC = (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B; SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_avx512_knot_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1); SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_avx512_kandn_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); // Invert LHS for the not. LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, DAG.getConstant(1, dl, MVT::v16i1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_avx512_kxnor_w: { SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1)); SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2)); SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS); // Invert result for the not. 
Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res, DAG.getConstant(1, dl, MVT::v16i1)); return DAG.getBitcast(MVT::i16, Res); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTRI; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTRI; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps); SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTRI; else Opcode = X86ISD::PCMPESTRI; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); auto &Context = MF.getMMI().getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); return DAG.getNode(getGlobalWrapperKind(), dl, VT, DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::x86_seh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? 
GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.x86.seh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else // This function handles the SP or FP case. Reg = RegInfo->getPtrSizedFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } } } static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let ExecutionDepsFix deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let ExecutionDepsFix deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. 
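  // The operand bundle assembled below mirrors a full x86 memory reference:
  // effective address = Base + Index * Scale + Disp, with Segment fixed at 0.
  // Scale therefore has to be an immediate (1, 2, 4 or 8 at the ISA level),
  // and each lane of Index addresses one scattered element.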
if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } /// Handles the lowering of builtin intrinsic that return the value /// of the extended control register. static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the XCR register to // return. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // Merge the two 32-bit values into a 64-bit one.. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read performance monitor /// counters (x86_rdpmc). static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the performance counter // to read. 
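  // Like RDTSC below, RDPMC delivers a 64-bit result split across EDX:EAX;
  // on 64-bit targets the halves are recombined as (HI << 32) | LO, while
  // 32-bit targets return them as a BUILD_PAIR of two i32 values.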
SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read the time stamp counter /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower /// READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } SDValue Chain = HI.getValue(1); if (Opcode == X86ISD::RDTSCP_DAG) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2)); // Explicitly store the content of ECX at the location passed in input // to the 'rdtscp' intrinsic. Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2), MachinePointerInfo()); } if (Subtarget.is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit Truncating Store with signed or unsigned saturation.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                               SDValue Val, SDValue Ptr, EVT MemVT,
                               MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat
             ? DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT,
                                                         MMO)
             : DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT,
                                                          MMO);
}

/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
                                     const SDLoc &Dl, SDValue Val, SDValue Ptr,
                                     SDValue Mask, EVT MemVT,
                                     MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat
             ? DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl,
                                                               MemVT, MMO)
             : DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl,
                                                                MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
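      // (On x86-64, for instance, the eventual expansion is along the lines
      // of a PUSHF-then-POP sequence for the reads and the reverse for the
      // writes; the explicit stack adjustment in that sequence is why the
      // copy-implying stack-adjustment flag is set just above.)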
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                         Scale, Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
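  // Illustrative note (not part of the original diff): XGETBV reads the
  // extended control register selected by ECX (XCR0 holds the x87/SSE/AVX
  // state-enable bits) and, like RDTSC/RDPMC above, returns a 64-bit value
  // split across EDX:EAX.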
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(),
                                true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                               MemVT, MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(),
                                   DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
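// Illustrative usage (not part of the original diff): this hook backs the
// llvm.read_register / llvm.write_register intrinsics, e.g. a global
// register variable in C such as
//   register void *sp asm("rsp");
// is read via an @llvm.read_register.i64 call whose metadata argument
// names "rsp". Only esp/rsp (and ebp/rbp when the function keeps a frame
// pointer) are accepted below; other names hit report_fatal_error.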
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                       .Case("esp", X86::ESP)
                       .Case("rsp", X86::RSP)
                       .Case("ebp", X86::EBP)
                       .Case("rbp", X86::RBP)
                       .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  SDLoc dl      (Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl (Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
                                MachinePointerInfo(TrmpAddr, 2),
                                /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 12),
                                /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
      DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                   Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
                                MachinePointerInfo(TrmpAddr, 1),
                                /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
                                MachinePointerInfo(TrmpAddr, 6),
                                /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                DAG.getConstant(11, DL, MVT::i8));
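  // Worked example (illustrative, not part of the original diff): with the
  // rounding field set to 11 (round to 0), the transform computes
  // ((1 | 2) + 1) & 3 = 0, FLT_ROUNDS' encoding for "round to 0"; with 01
  // (round to -inf) it computes ((0 | 2) + 1) & 3 = 3, as expected.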
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, DL, MVT::i16,
                DAG.getNode(ISD::AND, DL, MVT::i16,
                            CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, DL, MVT::i16,
                DAG.getNode(ISD::ADD, DL, MVT::i16,
                            DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, DL, MVT::i16)),
                DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

// Split an unary integer op into 2 half sized ops.
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}

// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

/// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
          "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
          "Unsupported value type for operation");

  // Use native supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}

// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
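  // Worked example (illustrative, not part of the original diff): for the
  // byte 0x1A the hi nibble is 0x1, so the LUT gives 3 and the lo-nibble
  // count is masked away (HiZ is false): CTLZ(0x1A) == 3. For 0x05 the hi
  // nibble is zero, so both counts are added: LUT[0] + LUT[5] == 4 + 1 == 5.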
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
  if (CurrVT.is512BitVector()) {
    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge result back from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    if (CurrVT.is512BitVector()) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}

static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}

/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

/// Break a 512-bit integer operation into two new 256-bit ones and then
/// concatenate the result back.
static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is512BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
                       Op.getOperand(0), Op.getOperand(1));
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integer, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
                              DAG.getConstant(0, DL, VT), N0);
    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
                     SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }

  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}

static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}

static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                        SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (VT.getScalarType() == MVT::i1)
    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
  // vector pairs, multiply and truncate.
  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
    if (Subtarget.hasInt256()) {
      // For 512-bit vectors, split into 256-bit vectors to allow the
      // sign-extension to occur.
      if (VT == MVT::v64i8)
        return Lower512IntArith(Op, DAG);
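      // Worked example (illustrative, not part of the original diff):
      // multiplying the i8 lanes -3 and 7 sign-extends them to the i16
      // lanes 0xFFFD and 0x0007; the i16 product is 0xFFEB (-21), and
      // truncation keeps the low byte 0xEB, i.e. -21 as an i8.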
      // For 256-bit vectors, split into 128-bit vectors to allow the
      // sign-extension to occur. We don't need this on AVX512BW as we can
      // safely sign-extend to v32i16.
      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
        return Lower256IntArith(Op, DAG);

      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
      return DAG.getNode(
          ISD::TRUNCATE, dl, VT,
          DAG.getNode(ISD::MUL, dl, ExVT,
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }

    assert(VT == MVT::v16i8 &&
           "Pre-AVX2 support only supports v16i8 multiplication");
    MVT ExVT = MVT::v8i16;

    // Extract the lo parts and sign extend to i16
    SDValue ALo, BLo;
    if (Subtarget.hasSSE41()) {
      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                              -1, 4, -1, 5, -1, 6, -1, 7};
      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      ALo = DAG.getBitcast(ExVT, ALo);
      BLo = DAG.getBitcast(ExVT, BLo);
      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
    }

    // Extract the hi parts and sign extend to i16
    SDValue AHi, BHi;
    if (Subtarget.hasSSE41()) {
      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                              -1, -1, -1, -1, -1, -1, -1, -1};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                              -1, 12, -1, 13, -1, 14, -1, 15};
      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
      AHi = DAG.getBitcast(ExVT, AHi);
      BHi = DAG.getBitcast(ExVT, BHi);
      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8bits of the lo/hi results and pack
    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }

  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
  if (VT == MVT::v4i32) {
    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
           "Should not custom lower when pmulld is available!");

    // If the upper 17 bits of each element are zero then we can use PMADD.
    APInt Mask17 = APInt::getHighBitsSet(32, 17);
    if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
      return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
                         DAG.getBitcast(MVT::v8i16, A),
                         DAG.getBitcast(MVT::v8i16, B));

    // Extract the odd parts.
    static const int UnpackMask[] = { 1, -1, 3, -1 };
    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

    // Multiply the even parts.
    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
    // Now multiply odd parts.
    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

    Evens = DAG.getBitcast(VT, Evens);
    Odds = DAG.getBitcast(VT, Odds);

    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
    static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }

  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
         "Only know how to lower V2I64/V4I64/V8I64 multiply");
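  // Illustrative identity (not part of the original diff): writing
  // a = ahi * 2^32 + alo and b = bhi * 2^32 + blo, the low 64 bits of
  // a * b are alo*blo + ((alo*bhi + ahi*blo) << 32) (mod 2^64), which is
  // exactly the pmuludq/shift/add sequence constructed below.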
  // 32-bit vector types used for MULDQ/MULUDQ.
  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

  // MULDQ returns the 64-bit result of the signed multiplication of the lower
  // 32-bits. We can lower with this if the sign bits stretch that far.
  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
      DAG.ComputeNumSignBits(B) > 32) {
    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
                       DAG.getBitcast(MulVT, B));
  }

  //  Ahi = psrlqi(a, 32);
  //  Bhi = psrlqi(b, 32);
  //
  //  AloBlo = pmuludq(a, b);
  //  AloBhi = pmuludq(a, Bhi);
  //  AhiBlo = pmuludq(Ahi, b);
  //
  //  Hi = psllqi(AloBhi + AhiBlo, 32);
  //  return AloBlo + Hi;
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

  // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
  // high bits are known to be zero.
  if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
    return Op;

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, A);
  SDValue Blo = DAG.getBitcast(MulVT, B);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

  // For 512-bit vectors, split into 256-bit vectors to allow the
  // sign-extension to occur.
  if (VT == MVT::v64i8)
    return Lower512IntArith(Op, DAG);
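  // Worked example (illustrative, not part of the original diff): for
  // MULHU on i8 lanes both equal to 200, the zero-extended i16 product is
  // 40000 = 0x9C40; shifting right by 8 and truncating yields 0x9C = 156,
  // the high half of the product.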
  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    unsigned NumElems = VT.getVectorNumElements();
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

    if (VT == MVT::v32i8) {
      if (Subtarget.hasBWI()) {
        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
                          DAG.getConstant(8, dl, MVT::v32i16));
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
      }
      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
      // before using PACKUS we need to permute the inputs to the correct
      // lo/hi xmm lane.
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }

    assert(VT == MVT::v16i8 && "Unexpected VT");
    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                      DAG.getConstant(8, dl, MVT::v16i16));
    // If we have BWI we can use truncate instruction.
    if (Subtarget.hasBWI())
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;
  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
  if (Subtarget.hasSSE41()) {
    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
  } else {
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }
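  // Illustrative note (not part of the original diff): the interleave
  // masks above place each source byte in the high byte of an i16 lane
  // (the -1 entries are "don't care"), so a single arithmetic or logical
  // shift right by 8 yields the sign- or zero-extended 16-bit value.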
  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
  } else {
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
  // and pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}

SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
          Callee, std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT),
                             Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT),
                             Hi0, Hi1);

    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }

  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
         (VT == MVT::v16i32 && Subtarget.hasAVX512()));

  int NumElts = VT.getVectorNumElements();

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
                      9, -1, 11, -1, 13, -1, 15, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], NumElts));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], NumElts));

  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SmallVector<int, 16> HighMask(NumElts);
  SmallVector<int, 16> LowMask(NumElts);
  for (int i = 0; i != NumElts; ++i) {
    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
  }

  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

  // If we have a signed multiply but no PMULDQ fix up the high parts of a
  // unsigned multiply.
  if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }
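  // Illustrative identity (not part of the original diff): for 32-bit
  // lanes, mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0);
  // T1 and T2 compute the two correction terms via an arithmetic shift
  // and an AND, so subtracting their sum converts the unsigned highs.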
  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;

  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());

  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}

// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {

  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SHL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };
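  // Illustrative check (not part of the original diff): ashr(x, 63)
  // broadcasts the sign bit, producing all-ones for negative x and zero
  // otherwise; that is exactly pcmpgt(0, x), which is why the lambda's
  // ShiftAmt == 63 case can use a single PCMPGT against zero.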
if (auto *BVAmt = dyn_cast(Amt)) { if (auto *ShiftConst = BVAmt->getConstantSplatNode()) { uint64_t ShiftAmt = ShiftConst->getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || (Subtarget.hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R); return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } } } // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. // TODO: Replace constant extraction with getTargetConstantBitsFromNode. if (!Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant. unsigned SubVectorScale = 1; if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) { SubVectorScale = Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits(); Amt = Amt.getOperand(0); } // Peek through any splat that was introduced for i64 shift vectorization. int SplatIndex = -1; if (ShuffleVectorSDNode *SVN = dyn_cast(Amt.getNode())) if (SVN->isSplat()) { SplatIndex = SVN->getSplatIndex(); Amt = Amt.getOperand(0); assert(SplatIndex < (int)VT.getVectorNumElements() && "Splat shuffle referencing second operand"); } if (Amt.getOpcode() != ISD::BITCAST || Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) return SDValue(); Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / (SubVectorScale * VT.getVectorNumElements()); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; unsigned BaseOp = (SplatIndex < 0 ? 
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }

    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }

    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}
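// Editor's illustrative aside (not part of the lowering): a scalar model of
// the ArithmeticShiftRight64 decomposition used above, assuming <cstdint>
// types are available and that >> on a negative int32_t is an arithmetic
// shift (true for the compilers LLVM supports).
static inline int64_t Ashr64ViaI32Sketch(int64_t V, unsigned ShiftAmt) {
  int32_t Hi = int32_t(uint64_t(V) >> 32);
  uint32_t Lo = uint32_t(uint64_t(V));
  if (ShiftAmt >= 32) {
    // Lower half = SRA of the old upper half; upper half = splatted sign.
    Lo = uint32_t(Hi >> (ShiftAmt - 32));
    Hi >>= 31;
  } else if (ShiftAmt != 0) {
    // SRL the whole i64 for the low bits, SRA the upper i32 for the rest.
    Lo = (Lo >> ShiftAmt) | (uint32_t(Hi) << (32 - ShiftAmt));
    Hi >>= ShiftAmt;
  }
  return int64_t((uint64_t(uint32_t(Hi)) << 32) | Lo);
}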
static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // Check if this build_vector node is doing a splat.
      // If so, then set BaseShAmt equal to the splat value.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          assert((SplatIdx <
                  InVec.getSimpleValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
          if (ConstantSDNode *C =
                  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget,
                                 DAG);
    }
  }

  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
  if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
       VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
  //   M = lshr(SIGN_MASK, Amt)
  //   ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Op.getOpcode() == ISD::SRA) {
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }
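  // Worked example of the identity above (added commentary): for an i8 value,
  // ashr(0x90, 4): M = lshr(0x80, 4) = 0x08, lshr(0x90, 4) = 0x09,
  // xor gives 0x01, and 0x01 - 0x08 = 0xF9, the correctly sign-extended
  // result 1111'1001.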
  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                     DAG.getConstant(0x3f800000U, dl, VT));
    Op = DAG.getBitcast(MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
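  // Exposition note (added commentary): the sequence above builds 2^Amt per
  // lane by placing Amt + 127 into the IEEE-754 single-precision exponent
  // field - (Amt << 23) + 0x3f800000 is exactly the bit pattern of 2.0^Amt
  // (e.g. Amt = 3 gives 0x41000000 = 8.0f) - so FP_TO_SINT recovers the
  // power-of-two multiplier and the shift becomes a MUL.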
  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector <X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
    bool UseMOVSD = false;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1)
                                      : Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD/PBLEND.
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        UseMOVSD = true;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        UseMOVSD = true;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
      SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
      if (UseMOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl,
                                                       BitCast1, BitCast2,
                                                       {0, 1, 6, 7}));
      return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }

  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
    assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
           "Unexpected vector type");
    MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
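    // Exposition note (added commentary): this is a bit-serial shift. After
    // a << 5, bit 2 of the original amount sits in each byte's sign bit, so
    // the PBLENDVB/VSELECT steps conditionally apply shifts of 4, then 2,
    // then 1. An amount of 5 (0b101) takes the shift-by-4 and shift-by-1
    // steps, for a total shift of 5.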
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of a i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero
      // upper byte, meaning that we can safely pack with PACKUSWB.
      RLo = DAG.getNode(ISD::SRL, dl, ExtVT, RLo,
                        DAG.getConstant(8, dl, ExtVT));
      RHi = DAG.getNode(ISD::SRL, dl, ExtVT, RHi,
                        DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to
    // PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel,
                              DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R,
                            DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (Subtarget.hasAVX512()) {
    // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts,
                                      EltBits)) {
      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
            return EltBits[0] == V;
          })) {
        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
        return DAG.getNode(Op, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Else, fall-back on VPROLV/VPRORV.
    return Op;
  }

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return Op;
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // A subtract of one will be selected as a INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();
  else
    return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86. We
    // must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls,
  // so there is no benefit in turning such RMWs into loads, and it is
  // actually harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order =
      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(
      Ptr, AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
        DAG.getRegister(X86::ESP, MVT::i32),    // Base
        DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
        DAG.getRegister(0, MVT::i32),           // Index
        DAG.getTargetConstant(0, dl, MVT::i32), // Disp
        DAG.getRegister(0, MVT::i32),           // Segment.
        Zero, Chain};
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch (T.SimpleTy) {
  default:
    llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the vector in input in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                       ToV2F64, DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // PSADBW instruction horizontally add all bytes and leave the result in i64
  // chunks, thus directly computes the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}

static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every lower byte nibble in the input vector is an
  // index into a in-register pre-computed pop count table. We then split up
  // the input vector in two new ones: (1) a vector with only the shifted-right
  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
  // masked out higher ones) for each byte. PSHUFB is used separately with both
  // to index the in-register table. Next, both are added and the result is a
  // i8 vector where each element contains the pop count for input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
  //
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that index elements into the
  // LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
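  // Worked example (added commentary): for the byte 0xE2 = 0b1110'0010 the
  // lookups give LUT[0xE] + LUT[0x2] = 3 + 1 = 4 set bits.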
  SDValue HighPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT,
                                   HighNibbles);
  SDValue LowPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT,
                                  LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}

static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.

  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // counts.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
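// Editor's illustrative aside (not part of the lowering): a scalar model of
// the bit-parallel scheme above, assuming <cstdint>. The final horizontal
// add is done here with a multiply; the vector code instead reduces via
// LowerHorizontalByteSum (PSADBW and friends).
static inline uint32_t PopCount32BitmathSketch(uint32_t V) {
  V = V - ((V >> 1) & 0x55555555u);                 // 2-bit partial sums
  V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u); // 4-bit partial sums
  V = (V + (V >> 4)) & 0x0F0F0F0Fu;                 // per-byte counts
  return (V * 0x01010101u) >> 24;                   // sum the four bytes
}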
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    unsigned NumElems = VT.getVectorNumElements();
    assert((VT.getVectorElementType() == MVT::i8 ||
            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
    if (NumElems <= 16) {
      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
  }

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}

static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasXOP() && !VT.is512BitVector())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
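  // Worked example (added commentary): bitreverse(0x2C): Lo nibble = 0xC,
  // Hi nibble = 0x2; LoLUT[0xC] = 0x30 and HiLUT[0x2] = 0x04, whose OR is
  // 0x34 = reverse(0b0010'1100).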
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}

static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool AllowIncDec = true) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    // Convert to inc/dec if they aren't slow or we are optimizing for size.
    if (AllowIncDec && (!Subtarget.slowIncDec() ||
                        DAG.getMachineFunction().getFunction().optForSize())) {
      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0),
                                       MMO);
      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0),
                                       MMO);
    }
  }

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}

/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}

static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0), Node->getOperand(1),
                                 Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}

static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
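// Editor's illustrative aside (not part of the lowering): the "set the carry
// flag" step above adds all-ones to the incoming boolean carry, which raises
// the hardware carry flag exactly when the carry is nonzero - 0 + 0xFF..FF
// does not wrap (CF = 0), while 1 + 0xFF..FF wraps (CF = 1) - and ADC/SBB
// then consume that flag. Scalar model for an 8-bit carry value, assuming
// <cstdint>:
static inline bool CarryFromAddAllOnesSketch(uint8_t CarryIn) {
  return unsigned(CarryIn) + 0xFFu > 0xFFu; // wraps iff CarryIn != 0
}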
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64
                            : RTLIB::SINCOS_STRET_F32;
  const char *LibcallName = TLI.getLibcallName(LC);
  SDValue Callee = DAG.getExternalSymbol(
      LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:64 xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }

  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();
    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT)
                                     : DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT)
                                   : DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp,
                     DAG.getIntPtrConstant(0, dl));
}

static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl,
                               DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // Mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    VT = MVT::v4i32;
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors, so either data or index should
    // be 512 bit wide. If both index and data are 256-bit, but the vector
    // contains 8 elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend index
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elts in scatter is 8.
      NumElts = 8;
      // Index
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask
      // At this point we have promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point - truncate it to i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}

static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. These types for exp-loads are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
         !VT.is512BitVector() && "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);

  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() &&
         !VT.is512BitVector() && "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
          "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);

  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // If the index is v2i32, we're being called by type legalization.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors.
    // Either the data or the index must be 512 bits wide. If both the index
    // and the data are currently 256-bit but the vector holds 8 elements, we
    // just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
      SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
          DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
          N->getMemOperand());
      return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
    }

    // The minimum number of elements in a gather is 8.
    NumElts = 8;
    // Index
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point we have a promoted mask operand.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-through value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl,
        N->getMemoryVT(), N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(2)};
    return DAG.getMergeValues(RetOps, dl);
  }

  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
  SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
      N->getMemOperand());
  return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
}

SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDLoc OpDL(Op);
  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}

/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return 
lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
  case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO: return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget, DAG);
  case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB: return LowerADD_SUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN: return LowerMINMAX(Op, DAG);
  case ISD::ABS: return LowerABS(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
  case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
  }
}

/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

  if (!Res.getNode())
    return;

  assert((N->getNumValues() <= Res->getNumValues()) &&
      "Lowering returned the wrong number of results!");

  // Places new result values based on N's result number.
  // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
  // original node, in which case the chain (the last value) should be dropped.
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}

/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    const unsigned RegSize = (InVTSize > 128) ? ((InVTSize > 256) ?
512 : 256) : 128; assert((Subtarget.hasBWI() || RegSize < 512) && "512-bit vector requires AVX512BW"); assert((Subtarget.hasAVX2() || RegSize < 256) && "256-bit vector requires AVX2"); auto ElemVT = InVT.getVectorElementType(); auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, RegSize / ElemVT.getSizeInBits()); assert(RegSize % InVT.getSizeInBits() == 0); unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; if (N->getValueType(0) == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue Src = N->getOperand(0); if (Src.getValueType() == MVT::v2f64) { MVT ResVT = MVT::v4i32; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. ResVT = MVT::v8i32; Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32 : MVT::v2i32; Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } if (Src.getValueType() == MVT::v2f32) { SDValue Idx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, // so early out here. return; } std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode()) { EVT VT = N->getValueType(0); // Return a load from the stack slot. 
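      // When SSE cannot produce the result directly, FP_TO_INTHelper emits an
      // x87 FIST/FISTP store into a temporary stack slot (see the
      // FP_TO_INT*_IN_MEM nodes); in that case FIST carries the store's chain
      // and the integer result has to be re-loaded from StackSlot here.
      // Otherwise FIST already is the converted value itself.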
if (StackSlot.getNode()) Results.push_back( DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); else Results.push_back(FIST); } return; } case ISD::SINT_TO_FP: { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = N->getOperand(0); if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) return; Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); return; } case ISD::UINT_TO_FP: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); return; } if (SrcVT != MVT::v2i32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!TLI.isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); case Intrinsic::x86_xgetbv: return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); } } case ISD::INTRINSIC_WO_CHAIN: { if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) Results.push_back(V); return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? 
X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); // If the current function needs the base pointer, RBX, // we shouldn't use cmpxchg directly. // Indeed the lowering of that instruction will clobber // that register and since RBX will be a reserved register // the register allocator will not make sure its value will // be properly saved and restored around this live-range. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { // ISel prefers the LCMPXCHG64 variant. // If that assert breaks, that means it is not the case anymore, // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, // not just EBX. This is a matter of accepting i64 input for that // pseudo, and restoring into the register of the right wide // in expand pseudo. Everything else should just work. assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && "Saving only half of the RBX"); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, HalfT, swapInH.getValue(1)); SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, RBXSave, /*Glue*/ RBXSave.getValue(2)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } else { unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. 
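    // (Illustrative example, not from the original source: an i128
    // 'atomicrmw add' reaching type legalization is expected to have been
    // rewritten into a cmpxchg16b loop by AtomicExpandPass already, so the
    // plain 'break' below simply defers to the generic legalizer.)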
break; } case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8)) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded); if (ExperimentalVectorWideningLegalization) { // If we are legalizing vectors by widening, we already have the desired // legal vector type, just return it. Results.push_back(ToVecInt); return; } SmallVector Elts; for (unsigned i = 0, e = NumElts; i != e; ++i) Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, ToVecInt, DAG.getIntPtrConstant(i, dl))); Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getValue(), DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } if (VT == MVT::v2i32) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Gather->getValue(), DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 we can use it directly. if (Index.getValueType() == MVT::v2i64 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); SDValue Chain = Res.getValue(2); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } EVT IndexVT = Index.getValueType(); EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), IndexVT.getScalarType(), 4); // Otherwise we need to custom widen everything to avoid promotion. 
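      // Sketch of the shapes produced below (lane values illustrative): both
      // operands are concatenated out to 4 lanes, and only the mask is padded
      // with zeroes so the two extra lanes of the gather stay inactive:
      //   index: <i0, i1> -> <i0, i1, undef, undef>
      //   mask:  <m0, m1> -> <m0, m1, 0, 0>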
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(IndexVT)); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), Index }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), Gather->getMemoryVT(), dl, Ops, Gather->getMemOperand()); SDValue Chain = Res.getValue(1); if (!ExperimentalVectorWideningLegalization) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } break; } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPMU: return "X86ISD::CMPMU"; case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND"; case X86ISD::ADDUS: return "X86ISD::ADDUS"; case X86ISD::SUBUS: return "X86ISD::SUBUS"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case 
X86ISD::CONFLICT: return "X86ISD::CONFLICT";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMAXS: return "X86ISD::FMAXS";
  case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
  case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMINS: return "X86ISD::FMINS";
  case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
  case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
  case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
  case X86ISD::LADD: return "X86ISD::LADD";
  case X86ISD::LSUB: return "X86ISD::LSUB";
  case X86ISD::LOR: return "X86ISD::LOR";
  case X86ISD::LXOR: return "X86ISD::LXOR";
  case X86ISD::LAND: return "X86ISD::LAND";
  case X86ISD::LINC: return "X86ISD::LINC";
  case X86ISD::LDEC: return "X86ISD::LDEC";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
  case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
  case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
  case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
  case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
  case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
  case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
  case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
  case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::VSRAV: return "X86ISD::VSRAV";
  case X86ISD::VROTLI: return "X86ISD::VROTLI";
  case X86ISD::VROTRI: return "X86ISD::VROTRI";
  case X86ISD::VPPERM: return "X86ISD::VPPERM";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::SMUL8: return "X86ISD::SMUL8"; case X86ISD::UMUL8: return "X86ISD::UMUL8"; case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG"; case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG"; case X86ISD::INC: return "X86ISD::INC"; case X86ISD::DEC: return "X86ISD::DEC"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::VSHLD: return "X86ISD::VSHLD"; case X86ISD::VSHRD: return "X86ISD::VSHRD"; case X86ISD::VSHLDV: return "X86ISD::VSHLDV"; case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case 
X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; case X86ISD::FMADDS1: return "X86ISD::FMADDS1"; case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1"; case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1"; case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1"; case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; case X86ISD::FMADDS3: return "X86ISD::FMADDS3"; case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3"; case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3"; case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3"; case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; case X86ISD::FMADD4S: return "X86ISD::FMADD4S"; case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S"; case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S"; case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECT: return "X86ISD::SELECT"; case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return 
"X86ISD::RCP28"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; } return nullptr; } /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // X86 supports extremely general addressing modes. 
CodeModel::Model M = getTargetMachine().getCodeModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || isPositionIndependent()) && Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable // shifts just as cheap as scalar ones. if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
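  // E.g. 'movl %ecx, %eax' already clears bits 63:32 of %rax, so no explicit
  // zero-extension instruction is ever needed for i32 -> i64.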
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl &Mask, EVT VT) const { // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // s0 = -1 // // fallBB: // eax = # XABORT_DEF // s1 = eax // // sinkMBB: // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. 
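  // (splice() moves every instruction after the XBEGIN pseudo into sinkMBB
  // and transferSuccessorsAndUpdatePHIs() re-points the CFG edges: the usual
  // custom-inserter idiom for carving a diamond out of a single block.)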
sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB // # fallthrough to mainMBB // # abortion to fallMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(fallMBB); // mainMBB: // mainDstReg := -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); // fallMBB: // ; pseudo instruction to model hardware's definition from XABORT // EAX := XABORT_DEF // fallDstReg := EAX BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); fallMBB->addSuccessor(sinkMBB); // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; } DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = MI.getNumOperands(); for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::XMM0); MI.eraseFromParent(); return BB; } // FIXME: Custom handling because TableGen doesn't support multiple implicit // defs in an instruction pattern static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB, const TargetInstrInfo *TII) { unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } DebugLoc dl = MI.getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); unsigned NumArgs = 
MI.getNumOperands(); // remove the results for (unsigned i = 1; i < NumArgs; ++i) { MachineOperand &Op = MI.getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.add(Op); } if (MI.hasOneMemOperand()) MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::ECX); MI.eraseFromParent(); return BB; } static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert input VAL into EAX BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(0).getReg()); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert zero to EDX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); // insert WRPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert RDPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::EAX); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget, unsigned Opc) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) .addReg(MI.getOperand(ValOps + 1).getReg()); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(Opc)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI->getOperand(i)); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. 
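  // (This implements the SysV AMD64 va_arg algorithm: consume from the
  // register save area via gp_offset/fp_offset while room remains, otherwise
  // fall back to the overflow area on the stack; the va_list layout is
  // sketched below.)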
// Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); MachineOperand &Disp = MI.getOperand(4); MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. 
endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) .addMBB(overflowMBB); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 16) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) .setMemRefs(MMOBegin, MMOEnd); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .setMemRefs(MMOBegin, MMOEnd); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. 
// (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) .setMemRefs(MMOBegin, MMOEnd); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI.eraseFromParent(); return endMBB; } MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); unsigned CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI.getNumOperands() <= 3 || !MI.getOperand(MI.getNumOperands() - 1).isReg() || MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. 
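// For instance (a sketch), with the SysV reg save area (VarArgsFPOffset == 48) this // emits roughly: movaps %xmm0, 48(FrameIndex); movaps %xmm1, 64(FrameIndex); ... // (VMOVAPSmr is used instead when AVX is available).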
for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI.getOperand(i).getReg()) .addMemOperand(MMO); } MI.eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return true; default: return false; } } // Helper function, which inserts PHI functions into SinkMBB: // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for // the last PHI function inserted. 
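// For example: if an earlier PHI produced %p = PHI [ %a, FalseMBB ], [ %b, TrueMBB ], // a later CMOV that consumed %p has %a substituted on the FalseMBB edge and %b on // the TrueMBB edge, since %p itself is only defined in SinkMBB.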
static MachineInstrBuilder createPHIsForCMOVsInSinkBB( MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB) { MachineFunction *MF = TrueMBB->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); DebugLoc DL = MIItBegin->getDebugLoc(); X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. // That also means that PHI construction must work forward from earlier to // later, and that the code must maintain a mapping from earlier PHIs' // destination registers to the registers that went into the PHI. DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; MachineInstrBuilder MIB; for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { unsigned DestReg = MIIt->getOperand(0).getReg(); unsigned Op1Reg = MIIt->getOperand(1).getReg(); unsigned Op2Reg = MIIt->getOperand(2).getReg(); // If this CMOV we are generating is the opposite condition from // the jump we generated, then we have to swap the operands for the // PHI that is going to be generated. if (MIIt->getOperand(3).getImm() == OppCC) std::swap(Op1Reg, Op2Reg); if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) Op1Reg = RegRewriteTable[Op1Reg].first; if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) Op2Reg = RegRewriteTable[Op2Reg].second; MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(FalseMBB) .addReg(Op2Reg) .addMBB(TrueMBB); // Add this PHI to the rewrite table. RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); } return MIB; } // Lower cascaded selects in the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2). MachineBasicBlock * X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, MachineInstr &SecondCascadedCMOV, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = FirstCMOV.getDebugLoc(); // We lower cascaded CMOVs such as // // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) // // to two successive branches. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. For instance, for // // (sitofp (zext (fcmp une))) // // we would generate: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // movaps %xmm0, %xmm1 // jne .LBB5_2 // xorps %xmm1, %xmm1 // .LBB5_2: // jp .LBB5_4 // movaps %xmm1, %xmm0 // .LBB5_4: // retq // // because this custom-inserter would have generated: // // A // | \ // | B // | / // C // | \ // | D // | / // E // // A: X = ...; Y = ... // B: empty // C: Z = PHI [X, A], [Y, B] // D: empty // E: PHI [X, C], [Z, D] // // If we lower both CMOVs in a single step, we can instead generate: // // A // | \ // | C // | /| // |/ | // | | // | D // | / // E // // A: X = ...; Y = ... // D: empty // E: PHI [X, A], [X, C], [Y, D] // // Which, in our sitofp/fcmp example, gives us something like: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // jne .LBB5_4 // jp .LBB5_4 // xorps %xmm0, %xmm0 // .LBB5_4: // retq // // We lower cascaded CMOV into two successive branches to the same block.
// EFLAGS is used by both, so mark it as live in the second. const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FirstInsertedMBB); F->insert(It, SecondInsertedMBB); F->insert(It, SinkMBB); // For a cascaded CMOV, we lower it to two successive branches to // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in // the FirstInsertedMBB. FirstInsertedMBB->addLiveIn(X86::EFLAGS); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { SecondInsertedMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(FirstCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FirstInsertedMBB); // The true block target of the first branch is always SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SecondInsertedMBB); // The true block for the branch of FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SinkMBB); // This is fallthrough. SecondInsertedMBB->addSuccessor(SinkMBB); // Create the conditional branch instructions. X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); unsigned Opc = X86::GetCondBranchFromCond(FirstCC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] unsigned DestReg = FirstCMOV.getOperand(0).getReg(); unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(SecondInsertedMBB) .addReg(Op2Reg) .addMBB(ThisMBB); // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), SecondCascadedCMOV.getOperand(0).getReg()) .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); SecondCascadedCMOV.eraseFromParent(); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. 
The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between and a branch opcode to use. // ThisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> FalseMBB // This code lowers all pseudo-CMOV instructions. Generally it lowers these // as described above, by inserting a BB, and then making a PHI at the join // point to select the true and false operands of the CMOV in the PHI. // // The code also handles two different cases of multiple CMOV opcodes // in a row. // // Case 1: // In this case, there are multiple CMOVs in a row, all which are based on // the same condition setting (or the exact opposite condition setting). // In this case we can lower all the CMOVs using a single inserted BB, and // then make a number of PHIs at the join point to model the CMOVs. The only // trickiness here, is that in a case like: // // t2 = CMOV cond1 t1, f1 // t3 = CMOV cond1 t2, f2 // // when rewriting this into PHIs, we have to perform some renaming on the // temps since you cannot have a PHI operand refer to a PHI result earlier // in the same block. The "simple" but wrong lowering would be: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t2(BB1), f2(BB2) // // but clearly t2 is not defined in BB1, so that is incorrect. The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2: // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate // function - EmitLoweredCascadedSelect. X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FalseMBB); F->insert(It, SinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!LastCMOV->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { FalseMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. 
SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FalseMBB); // The true block target of the first (or only) branch is always a SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FalseMBB. FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). ThisMBB->erase(MIItBegin, MIItEnd); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, MachineBasicBlock *BB) const { // Combine the following atomic floating-point modification pattern: // a.store(reg OP a.load(acquire), release) // Transform them into: // OPss (%gpr), %xmm // movss %xmm, (%gpr) // Or sd equivalent for 64-bit operations. unsigned MOp, FOp; switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); case X86::RELEASE_FADD32mr: FOp = X86::ADDSSrm; MOp = X86::MOVSSmr; break; case X86::RELEASE_FADD64mr: FOp = X86::ADDSDrm; MOp = X86::MOVSDmr; break; } const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned ValOpIdx = X86::AddrNumOperands; unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(FOp), MRI.createVirtualRegister(MRI.getRegClass(VSrc))) .addReg(VSrc); for (int i = 0; i < X86::AddrNumOperands; ++i) { MachineOperand &Operand = MI.getOperand(i); // Clear any kill flags on register operands as we'll create a second // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget.is64Bit(); const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... 
// [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. 
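// That is: %dst = phi [ %mallocPtr, mallocMBB ], [ %bumpSPPtr, bumpMBB ]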
BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) .addReg(bumpSPPtrVReg) .addMBB(bumpMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction().getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget.is32Bit()) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue past them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. // We don't call erase from parent because we want to keep the // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register.
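// For example, on x86-64 Darwin the emitted sequence is roughly: // movq _var@TLVP(%rip), %rdi // callq *(%rdi) ## thread-local address returned in %rax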
MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget.is64Bit() ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI.eraseFromParent(); // The pseudo instruction is gone now. 
return BB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ?
X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOBegin, MMOEnd); // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin(); MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ?
X86::JMP64r : X86::JMP32r; // Reload FP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); MIB.setMemRefs(MMOBegin, MMOEnd); // Reload IP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), LabelOffset); else MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Reload SP MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else MIB.add(MI.getOperand(i)); } MIB.setMemRefs(MMOBegin, MMOEnd); // Jump BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return MBB; } void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); unsigned Op = 0; unsigned VR = 0; bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); if (UseImmLabel) { Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; } else { const TargetRegisterClass *TRC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else MIB.addReg(VR); } MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugValue()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table.
std::vector<MachineBasicBlock *> LPadList; SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { const bool FPIs64Bit = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setRestoreBasePointer(MF); unsigned FP = RI.getFrameRegister(*MF); unsigned BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ?
8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) .addImm(0) .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) .addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); // addq BReg, OReg64, TReg BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); break; } default: llvm_unreachable("Unexpected jump table encoding"); } } else { // jmpl *.LJTI0_0(,IReg,4) BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) .addJumpTableIndex(MJTI) .addReg(0); } // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector<MachineBasicBlock *, 64> MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap<unsigned, bool> DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RI = 0; SavedRegs[RI]; ++RI) { unsigned Reg = SavedRegs[RI]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now.
MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TAILJMPd64: case X86::TAILJMPr64: case X86::TAILJMPm64: case X86::TAILJMPr64_REX: case X86::TAILJMPm64_REX: llvm_unreachable("TAILJMP64 would not be touched here."); case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: return BB; case X86::TLS_addr32: case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_FR128: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); case X86::RDFLAGS32: case X86::RDFLAGS64: { unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); // Permit reads of the FLAGS register without it being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. Push->getOperand(2).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... 
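// (0xC7F sets the rounding-control field, bits 11:10, to 11b = round toward zero, // and keeps all x87 exceptions masked in the low byte.)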
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // String/text processing lowering. case X86::PCMPISTRM128REG: case X86::VPCMPISTRM128REG: case X86::PCMPISTRM128MEM: case X86::VPCMPISTRM128MEM: case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: case X86::VPCMPESTRM128MEM: assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo()); // String/text processing lowering. case X86::PCMPISTRIREG: case X86::VPCMPISTRIREG: case X86::PCMPISTRIMEM: case X86::VPCMPISTRIMEM: case X86::PCMPESTRIREG: case X86::VPCMPESTRIREG: case X86::PCMPESTRIMEM: case X86::VPCMPESTRIMEM: assert(Subtarget.hasSSE42() && "Target must have SSE4.2 or AVX features enabled"); return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo()); // Thread synchronization. case X86::MONITOR: return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); // Cache line zero case X86::CLZERO: return emitClzero(&MI, BB, Subtarget); // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); case X86::RDPKRU: return emitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case X86::Int_eh_sjlj_setup_dispatch: return EmitSjLjDispatchBlock(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. 
return BB; case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B // requires a memory operand. If the current architecture happens to be i686 and // the current function needs a base pointer - which is ESI on i686 - the register // allocator would not be able to allocate registers for an address in the form // X(%reg, %reg, Y): there would never be enough unreserved registers during // regalloc (without the base pointer the only option would be X(%edi, %esi, Y)). // We give the register allocator a hand by precomputing the address in a new // vreg using LEA. // If this is not i686 or there is no base pointer, there is nothing to do here. if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) return BB; // Even though this code does not necessarily need the base pointer to be ESI, // we check for that. The reason: if this assert fails, some changes have // happened in the compiler's base pointer handling, and they most probably // have to be addressed here as well. assert(TRI->getBaseRegister() == X86::ESI && "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " "base pointer in mind"); MachineRegisterInfo &MRI = MF->getRegInfo(); MVT SPTy = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); X86AddressMode AM = getAddressFromInstr(&MI, 0); // Regalloc does not need any help when the memory operand of CMPXCHG8B // does not use an index register. if (AM.IndexReg == X86::NoRegister) return BB; // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its // four operand definitions that are E[ABCD] registers. We skip them and // then insert the LEA. MachineBasicBlock::iterator MBBI(MI); while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) || MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX)) --MBBI; addFullAddress( BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); setDirectAddressInInstr(&MI, 0, computedAddrVReg); return BB; } case X86::LCMPXCHG16B: return BB; case X86::LCMPXCHG8B_SAVE_EBX: case X86::LCMPXCHG16B_SAVE_RBX: { unsigned BasePtr = MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ?
X86::EBX : X86::RBX; if (!BB->isLiveIn(BasePtr)) BB->addLiveIn(BasePtr); return BB; } } } //===----------------------------------------------------------------------===// // X86 Optimization Hooks //===----------------------------------------------------------------------===// void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { default: break; case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); Known.Zero.setBitsFrom(NumLoBits); break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1); Known = Known.zextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } case X86ISD::VSHLI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { Known.setAllZero(); break; } DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned ShAmt = ShiftImm->getZExtValue(); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else { Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits are known zero. Known.Zero.setHighBits(ShAmt); } } break; } case X86ISD::VZEXT: { // TODO: Add DemandedElts support. SDValue N0 = Op.getOperand(0); unsigned NumElts = VT.getVectorNumElements(); EVT SrcVT = N0.getValueType(); unsigned InNumElts = SrcVT.getVectorNumElements(); unsigned InBitWidth = SrcVT.getScalarSizeInBits(); assert(InNumElts >= NumElts && "Illegal VZEXT input"); Known = KnownBits(InBitWidth); APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts); DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1); Known = Known.zext(BitWidth); Known.Zero.setBitsFrom(InBitWidth); break; } case X86ISD::CMOV: { DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; KnownBits Known2; DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; } case X86ISD::UDIVREM8_ZEXT_HREG: // TODO: Support more than just the zero extended bits? if (Op.getResNo() != 1) break; // The remainder is zero extended. Known.Zero.setBitsFrom(8); break; } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned VTBits = Op.getScalarValueSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; case X86ISD::VSEXT: { // TODO: Add DemandedElts support. 
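    // Illustrative arithmetic for the code below: sign-extending, e.g.,
    // v8i16 -> v8i32 appends 16 copies of the sign bit, so a source known to
    // have N sign bits yields N + 16 known sign bits in the wider type.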
SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

  case X86ISD::VTRUNC: {
    // TODO: Add DemandedElts support.
    SDValue Src = Op.getOperand(0);
    unsigned NumSrcBits = Src.getScalarValueSizeInBits();
    assert(VTBits < NumSrcBits && "Illegal truncation input type");
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    if (Tmp > (NumSrcBits - VTBits))
      return Tmp - (NumSrcBits - VTBits);
    return 1;
  }

  case X86ISD::PACKSS: {
    // PACKSS is just a truncation if the sign bits extend to the packed size.
    // TODO: Add DemandedElts support.
    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    unsigned Tmp = std::min(Tmp0, Tmp1);
    if (Tmp > (SrcBits - VTBits))
      return Tmp - (SrcBits - VTBits);
    return 1;
  }

  case X86ISD::VSHLI: {
    SDValue Src = Op.getOperand(0);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    if (ShiftVal.uge(VTBits))
      return VTBits; // Shifted all bits out --> zero.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    if (ShiftVal.uge(Tmp))
      return 1; // Shifted all sign bits out --> unknown.
    return Tmp - ShiftVal.getZExtValue();
  }

  case X86ISD::VSRAI: {
    SDValue Src = Op.getOperand(0);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    if (ShiftVal.uge(VTBits - 1))
      return VTBits; // Sign splat.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;

  case X86ISD::CMOV: {
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
    if (Tmp0 == 1) return 1;  // Early out.
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
    return std::min(Tmp0, Tmp1);
  }

  case X86ISD::SDIVREM8_SEXT_HREG:
    // TODO: Support more than just the sign extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is sign extended.
    return VTBits - 7;
  }

  // Fallback case.
  return 1;
}

SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}

/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                    bool AllowFloatDomain, bool AllowIntDomain,
                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
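  // Worked example for the loop below: a v4i32 mask {0, Z, 1, Z} (Z being
  // zero/undef) matches at Scale == 2, i.e. the low two elements are
  // zero-extended in place (what PMOVZXDQ produces), and is reported as
  // ISD::ZERO_EXTEND_VECTOR_INREG.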
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; unsigned NumDstElts = NumMaskElts / Scale; for (unsigned i = 0; i != NumDstElts && Match; ++i) { Match &= isUndefOrEqual(Mask[i * Scale], (int)i); Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); Shuffle = unsigned(X86ISD::VZEXT); } else Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); return true; } } } // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; } } if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; } } if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; } } // Attempt to match against broadcast-from-vector. if (Subtarget.hasAVX2()) { SmallVector BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { SrcVT = DstVT = MaskVT; Shuffle = X86ISD::VBROADCAST; return true; } } return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. 
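// A note on the immediates produced below: getV4X86ShuffleImm packs four
// 2-bit element indices LSB-first, so e.g. the repeated mask {3, 1, 2, 0}
// encodes as 3 | (1 << 2) | (2 << 4) | (0 << 6) == 0x27.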
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } } } else if (AllowFloatDomain && Subtarget.hasAVX()) { // VPERMILPD can permute with a non-repeating shuffle. Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; for (int i = 0, e = Mask.size(); i != e; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); PermuteImm |= (M & 1) << i; } return true; } } // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef LoMask(Mask.data() + 0, 4); ArrayRef HiMask(Mask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } // PSHUFHW: permute upper 4 elements only. if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { // Offset the HiMask so that we can create the shuffle immediate. int OffsetHiMask[4]; for (int i = 0; i != 4; ++i) OffsetHiMask[i] = (HiMask[i] < 0 ? 
HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } } } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; } } return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle. // TODO add support for 256/512-bit types. if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) { if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } return false; } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); PermuteImm = ByteRotation; return true; } } // Attempt to combine to X86ISD::BLENDI. 
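  // BLENDI's immediate is a per-element selector: bit i set takes element i
  // from V2, clear takes it from V1. E.g. for v4f32 the mask {0, 5, 2, 7}
  // becomes the blend immediate 0b1010.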
if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask)) { assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); PermuteImm = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) PermuteImm |= 1 << i; V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } else { // Determine a type compatible with X86ISD::BLENDI. ShuffleVT = MaskVT; if (Subtarget.hasAVX2()) { if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v8i32; else if (ShuffleVT == MVT::v2i64) ShuffleVT = MVT::v4i32; } else { if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) ShuffleVT = MVT::v8i16; else if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v4f64; else if (ShuffleVT == MVT::v8i32) ShuffleVT = MVT::v8f32; } if (!ShuffleVT.isFloatingPoint()) { int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); } V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; return true; } } } // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } } // Attempt to combine to SHUFPD. if (AllowFloatDomain && EltSizeInBits == 64 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. if (AllowFloatDomain && EltSizeInBits == 32 && ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { // Match each half of the repeated mask, to determine if its just // referencing one of the vectors, is zeroable or entirely undef. auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; if (isUndefInRange(RepeatedMask, Offset, 2)) { return DAG.getUNDEF(MaskVT); } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { S0 = (SM_SentinelUndef == M0 ? -1 : 0); S1 = (SM_SentinelUndef == M1 ? -1 : 1); return getZeroVector(MaskVT, Subtarget, DAG, DL); } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { S0 = (SM_SentinelUndef == M0 ? 
-1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V1; } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V2; } return SDValue(); }; int ShufMask[4] = {-1, -1, -1, -1}; SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); if (Lo && Hi) { V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } } } return false; } /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); // Find the inputs that enter the chain. Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); return DAG.getBitcast(RootVT, V1); } unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. // TODO - this currently prevents all lane shuffles from occurring. // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless // we need to use the zeroing feature. // TODO - this should support binary shuffles. 
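  // VPERM2X128's immediate picks a whole 128-bit lane per nibble: bits[1:0]
  // choose the source lane for the low half, bits[5:4] for the high half,
  // and bits 3/7 force-zero the corresponding half. E.g. swapping the two
  // halves of a single vector (BaseMask {1, 0}) encodes as 0x01 below.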
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), DAG.getConstant(PermMask, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // For masks that have been widened to 128-bit elements or more, // narrow back down to 64-bit elements. SmallVector Mask; if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; scaleShuffleMask(MaskScale, BaseMask, Mask); } else { Mask = SmallVector(BaseMask.begin(), BaseMask.end()); } unsigned NumMaskElts = Mask.size(); unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; // Determine the effective mask value type. FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); // Only allow legal mask types. if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. APInt Zeroable(NumMaskElts, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (isUndefOrZero(Mask[i])) Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; ArrayRef HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { return DAG.getBitcast(RootVT, V1); } } SDValue NewV1 = V1; // Save operand in case early exit happens. if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! 
Res = DAG.getBitcast(ShuffleSrcVT, NewV1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } } SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); DCI.AddToWorklist(NewV1.getNode()); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); DCI.AddToWorklist(NewV2.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } NewV1 = V1; // Save operands in case early exit happens. NewV2 = V2; if (matchBinaryPermuteVectorShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); DCI.AddToWorklist(NewV1.getNode()); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); DCI.AddToWorklist(NewV2.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! 
V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(IntMaskVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); DCI.AddToWorklist(Zero.getNode()); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // If we have a dual input lane-crossing shuffle then lower to VPERMV3. 
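    // VPERMV3 indexes the concatenation of its two sources: mask values in
    // [0, NumElts) read the first operand and [NumElts, 2*NumElts) the
    // second. That is why the unary+zero case above rewrote SM_SentinelZero
    // lanes to second-source indices and supplied an all-zero second source.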
if (AllowVariableMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
      DCI.AddToWorklist(VPermMask.getNode());
      V1 = DAG.getBitcast(MaskVT, V1);
      DCI.AddToWorklist(V1.getNode());
      V2 = DAG.getBitcast(MaskVT, V2);
      DCI.AddToWorklist(V2.getNode());
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
      DCI.AddToWorklist(Res.getNode());
      return DAG.getBitcast(RootVT, Res);
    }
    return SDValue();
  }

  // See if we can combine a single input shuffle with zeros to a bit-mask,
  // which is much simpler than any shuffle.
  if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
    APInt UndefElts(NumMaskElts, 0);
    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts.setBit(i);
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
    DCI.AddToWorklist(BitMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    unsigned AndOpcode =
        FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
    DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes use the variable mask to VPERMILPS.
  // TODO Combine other mask types at higher depths.
  if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
    SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx = M < 0 ? DAG.getUNDEF(MVT::i32)
                          : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
    SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
    DCI.AddToWorklist(VPermMask.getNode());
    Res = DAG.getBitcast(MaskVT, V1);
    DCI.AddToWorklist(Res.getNode());
    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
    DCI.AddToWorklist(Res.getNode());
    return DAG.getBitcast(RootVT, Res);
  }

  // With XOP, binary shuffles of 128/256-bit floating point vectors can
  // combine to VPERMIL2PD/VPERMIL2PS.
  if (AllowVariableMask && Subtarget.hasXOP() &&
      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
       MaskVT == MVT::v8f32)) {
    // VPERMIL2 Operation.
    // Bits[3] - Match Bit.
    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
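    // Index computation example for v4f32 (one 128-bit lane): a combined mask
    // value M == 5 yields (5 % 4) + ((5 / 4) * 4) == 5, i.e. selector 0b101:
    // element 1 taken from the second source. For 64-bit elements the index
    // is shifted left once, since the PD form expects it one bit higher.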
unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector VPerm2Idx; unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { VPerm2Idx.push_back(-1); continue; } if (M == SM_SentinelZero) { M2ZImm = 2; VPerm2Idx.push_back(8); continue; } int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); VPerm2Idx.push_back(Index); } V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPerm2MaskOp.getNode()); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector PSHUFBMask; int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Res = DAG.getBitcast(ByteVT, V1); DCI.AddToWorklist(Res.getNode()); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) SmallVector VPPERMMask; int NumBytes = 16; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; V1 = DAG.getBitcast(ByteVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ByteVT, V2); DCI.AddToWorklist(V2.getNode()); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); DCI.AddToWorklist(VPPERMMaskOp.getNode()); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); DCI.AddToWorklist(Res.getNode()); return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. 
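  // An empty SDValue here tells combineX86ShufflesRecursively that no cheaper
  // replacement exists at this depth, so the original chain is left intact.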
return SDValue();
}

// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
                                           ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  MVT VT = Root.getSimpleValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  bool OneUseConstantOp = false;
  SmallVector<APInt, 16> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue SrcOp = Ops[i];
    OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return SDValue();
  }

  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle; this
  // avoids constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return SDValue();

  // Shuffle the constant bits according to the mask.
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  DCI.AddToWorklist(CstOp.getNode());
  return DAG.getBitcast(VT, CstOp);
}

/// \brief Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available.
We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, int Depth, bool HasVariableMask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) return SDValue(); // Directly rip through bitcasts to find the underlying operand. SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return SDValue(); // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. SmallVector OpMask; SmallVector OpInputs; if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); // Add the inputs to the Ops list, avoiding duplicates. SmallVector Ops(SrcOps.begin(), SrcOps.end()); int InputIdx0 = -1, InputIdx1 = -1; for (int i = 0, e = Ops.size(); i < e; ++i) { SDValue BC = peekThroughBitcasts(Ops[i]); if (Input0 && BC == peekThroughBitcasts(Input0)) InputIdx0 = i; if (Input1 && BC == peekThroughBitcasts(Input1)) InputIdx1 = i; } if (Input0 && InputIdx0 < 0) { InputIdx0 = SrcOpIndex; Ops[SrcOpIndex] = Input0; } if (Input1 && InputIdx1 < 0) { InputIdx1 = Ops.size(); Ops.push_back(Input1); } assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); // This function can be performance-critical, so we rely on the power-of-2 // knowledge that we have about the mask sizes to replace div/rem ops with // bit-masks and shifts. 
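  // Worked example of the ratio arithmetic below: with a 4-element RootMask
  // and an 8-element OpMask, MaskWidth == 8, RootRatio == 2 and OpRatio == 1.
  // Output lane i == 5 then uses RootIdx == 2, and the scaled root index is
  // (RootMask[2] << 1) + (5 & 1), i.e. 2 * RootMask[2] + 1.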
assert(isPowerOf2_32(RootMask.size()) &&
         "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
  assert((RootRatio == 1 || OpRatio == 1) &&
         "Must not have a ratio for both incoming and op masks!");

  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by
  // the root mask to get us all the way to the root value arrangement. The
  // reason for this order is that we are recursing up the operation chain.
  for (unsigned i = 0; i < MaskWidth; ++i) {
    unsigned RootIdx = i >> RootRatioLog2;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }

  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
    return DAG.getUNDEF(Root.getValueType());

  // TODO - should we handle the mixed zero/undef case as well? Just returning
  // a zero mask will lose information on undef elements possibly reducing
  // future combine possibilities.
  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
                         SDLoc(Root));

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle).
The source op should only be combined if it either has a // single use (i.e. current Op) or all its users have already been combined. for (int i = 0, e = Ops.size(); i < e; ++i) if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, DAG, DCI, Subtarget)) return Res; // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget)) return Cst; // We can only combine unary and binary shuffle mask cases. if (Ops.size() > 2) return SDValue(); // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } // Canonicalization of binary shuffle masks to improve pattern matching by // commuting the inputs. if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(Ops[0], Ops[1]); } // Finally, try to combine into a single shuffle instruction. return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG, DCI, Subtarget); } /// \brief Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; SmallVector Ops; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); // If we have more than 128-bits, only the low 128-bits of shuffle mask // matter. Check that the upper masks are repeats and remove them. if (VT.getSizeInBits() > 128) { int LaneElts = 128 / VT.getScalarSizeInBits(); #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"); #endif Mask.resize(LaneElts); } switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// \brief Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. 
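  // E.g. for PSHUFD(PSHUFLW(PSHUFD(x))): if the outer dword shuffle leaves
  // the low two dwords alone, the PSHUFLW is pushed onto Chain, the two
  // PSHUFD masks are composed (M = VMask[M]), and the chain is rebuilt on
  // top of the single merged dword shuffle.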
SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); LLVM_FALLTHROUGH; case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; } /// \brief Search for a combinable shuffle across a chain ending in pshuflw or /// pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same /// pair of dwords. 
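/// E.g. in PSHUFLW(PSHUFHW(PSHUFLW(x))) the intervening PSHUFHW only touches
/// the other half, so it is skipped as a no-op and the two PSHUFLW masks are
/// composed into one shuffle (M = VMask[M] below).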
static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { assert( (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); unsigned CombineOpcode = N.getOpcode(); // Walk up a single-use chain looking for a combinable shuffle. SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return false; // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOpcode) break; // Other-half shuffles are no-ops. continue; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return false; // Combine away the bottom node as its shuffle will be accumulated into // a preceding shuffle. DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Record the old value. SDValue Old = V; // Merge this node's mask and our incoming mask (adjusted to account for all // the pshufd instructions encountered). SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Check that the shuffles didn't cancel each other out. If not, we need to // combine to the new one. if (Old != V) // Replace the combinable shuffle with the combined one, updating all users // so that we re-evaluate the chain here. DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true); return true; } /// \brief Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; unsigned Opcode = N.getOpcode(); // Combine binary shuffle of 2 similar 'Horizontal' instructions into a // single instruction. if (VT.getScalarSizeInBits() == 64 && (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || Opcode == X86ISD::UNPCKL)) { auto BC0 = peekThroughBitcasts(N.getOperand(0)); auto BC1 = peekThroughBitcasts(N.getOperand(1)); EVT VT0 = BC0.getValueType(); EVT VT1 = BC1.getValueType(); unsigned Opcode0 = BC0.getOpcode(); unsigned Opcode1 = BC1.getOpcode(); if (Opcode0 == Opcode1 && VT0 == VT1 && (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { SDValue Lo, Hi; if (Opcode == X86ISD::MOVSD) { Lo = BC1.getOperand(0); Hi = BC0.getOperand(1); } else { Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); } SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); DCI.AddToWorklist(Horiz.getNode()); return DAG.getBitcast(VT, Horiz); } } switch (Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; case X86ISD::UNPCKL: { // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE // moves upper half elements into the lower half part. 
For example: // // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, // undef:v16i8 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 // // will be combined to: // // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not // happen due to advanced instructions. if (!VT.is128BitVector()) return SDValue(); auto Op0 = N.getOperand(0); auto Op1 = N.getOperand(1); if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) { ArrayRef Mask = cast(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); SmallVector ExpectedMask(NumElts, -1); std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, NumElts / 2); auto ShufOp = Op1.getOperand(0); if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); } return SDValue(); } case X86ISD::BLENDI: { SDValue V0 = N->getOperand(0); SDValue V1 = N->getOperand(1); assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() && "Unexpected input vector types"); // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector // operands and changing the mask to 1. This saves us a bunch of // pattern-matching possibilities related to scalar math ops in SSE/AVX. // x86InstrInfo knows how to commute this back after instruction selection // if it would help register allocation. // TODO: If optimizing for size or a processor that doesn't suffer from // partial register update stalls, this should be transformed into a MOVSD // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD. if (VT == MVT::v2f64) if (auto *Mask = dyn_cast(N->getOperand(2))) if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) { SDValue NewMask = DAG.getConstant(1, DL, MVT::i8); return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask); } return SDValue(); } case X86ISD::MOVSD: case X86ISD::MOVSS: { SDValue V0 = peekThroughBitcasts(N->getOperand(0)); SDValue V1 = peekThroughBitcasts(N->getOperand(1)); bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode()); bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode()); if (isZero0 && isZero1) return SDValue(); // We often lower to MOVSD/MOVSS from integer as well as native float // types; remove unnecessary domain-crossing bitcasts if we can to make it // easier to combine shuffles later on. We've already accounted for the // domain switching cost when we decided to lower with it. bool isFloat = VT.isFloatingPoint(); bool isFloat0 = V0.getSimpleValueType().isFloatingPoint(); bool isFloat1 = V1.getSimpleValueType().isFloatingPoint(); if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) { MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32) : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32); V0 = DAG.getBitcast(NewVT, V0); V1 = DAG.getBitcast(NewVT, V1); return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1)); } return SDValue(); } case X86ISD::INSERTPS: { assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; // If we zero out all elements from Op0 then we don't need to reference it. 
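  // (The 8-bit insertps immediate is laid out as [src:2][dst:2][zeromask:4],
  // so "all elements from Op0 zeroed or overwritten" is exactly
  // (ZeroMask | (1u << DstIdx)) == 0xF. As an illustrative instance, a mask
  // of 0x1D (src=0, dst=1, zeromask=0b1101) inserts source element 0 into
  // element 1 and zeroes elements 0, 2 and 3.)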
if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; SmallVector Ops1; if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; continue; } // The input vector element must be inline. if (M != i && M != (i + 4)) return SDValue(); // Determine which inputs of the target shuffle we're using. UseInput00 |= (0 <= M && M < 4); UseInput01 |= (4 <= M); } // If we're not using both inputs of the target shuffle then use the // referenced input directly. if (UseInput00 && !UseInput01) { Updated = true; Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; Op0 = Ops0[1]; } if (Updated) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); return SDValue(); } default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 
0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); V = DAG.getBitcast(DVT, V); DCI.AddToWorklist(V.getNode()); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); DCI.AddToWorklist(V.getNode()); return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. V = DAG.getBitcast(VT, D.getOperand(0)); DCI.AddToWorklist(V.getNode()); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; } return SDValue(); } /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SDValue &Opnd0, SDValue &Opnd1, bool matchSubAdd = false) { EVT VT = N->getValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) && (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64))) return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return false; ArrayRef OrigMask = cast(N)->getMask(); SmallVector Mask(OrigMask.begin(), OrigMask.end()); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB; unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD; // We require the first shuffle operand to be the ExpectedOpcode node, // and the second to be the NextExpectedOpcode node. 
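  // For example (an illustrative v4f32 ADDSUB case): with V1 = (fsub A, B)
  // and V2 = (fadd A, B), the blend mask <0,5,2,7> selects the fsub result in
  // lanes 0/2 and the fadd result in lanes 1/3, which is exactly what
  // ADDSUBPS computes.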
if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode) return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; // We're looking for blends between FADD and FSUB nodes. We insist on these // nodes being lined up in a specific expected pattern. if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) || isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) || isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}))) return false; Opnd0 = LHS; Opnd1 = RHS; return true; } /// \brief Try to combine a shuffle into a target-specific add-sub or /// mul-add-sub node. static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1)) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } /// \brief Try to combine a shuffle into a target-specific /// mul-sub-add node. static SDValue combineShuffleToFMSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true)) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMSUBADD node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2); return SDValue(); } // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX2() || !isa(N)) return SDValue(); EVT VT = N->getValueType(0); // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (VT.getVectorElementType() != MVT::i32 && VT.getVectorElementType() != MVT::i64 && VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check that both sources are concats with undef. 
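  // For example (an illustrative v8i32 case), once these checks pass:
  //   shuffle (concat t1, undef), (concat t2, undef), <0,8,1,9,2,10,3,11>
  // is rebuilt below as
  //   shuffle (concat t1, t2), undef, <0,4,1,5,2,6,3,7>
  // since t2 now occupies the upper half of the single concatenated source.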
if (N0.getOpcode() != ISD::CONCAT_VECTORS || N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) return SDValue(); // Construct the new shuffle mask. Elements from the first source retain their // index, but elements from the second source no longer need to skip an undef. SmallVector Mask; int NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (int Elt : SVOp->getMask()) Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); SDLoc DL(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } /// Eliminate a redundant shuffle of a horizontal math op. static SDValue foldShuffleOfHorizOp(SDNode *N) { if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) return SDValue(); SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) return SDValue(); // 128-bit horizontal math instructions are defined to operate on adjacent // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. // TODO: 256-bit is not the same because...x86. if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128) return SDValue(); // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If the shuffle is also replicating // low and high halves, we don't need the shuffle. // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef Mask = cast(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, // but this should be tied to whatever horizontal op matching and shuffle // canonicalization are producing. if (isTargetShuffleEquivalent(Mask, { 0, 0 }) || isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) || isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 })) return HOp; return SDValue(); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG)) return FMSubAdd; if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; } // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. 
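  // For example (an illustrative instance):
  //   (shuffle (bitcast (add A:v2i64, B)), undef, <0,2,u,u>)
  // keeps only the low i32 of each i64 lane, so it can become
  //   (shuffle (add (bitcast A:v4i32), (bitcast B:v4i32)), undef, <0,2,u,u>);
  // carries in the add never propagate downward into the kept elements.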
if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD: case ISD::SUB: case ISD::MUL: // isOperationLegal lies for integer ops on floating point types. CanFold = VT.isInteger(); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: // isOperationLegal lies for floating point ops on integer types. CanFold = VT.isFloatingPoint(); break; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); } } } // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { Elts.push_back(Elt); continue; } Elts.clear(); break; } if (Elts.size() == VT.getVectorNumElements()) if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) return LD; // For AVX2, we sometimes want to combine // (vector_shuffle (concat_vectors t1, undef) // (concat_vectors t2, undef)) // Into: // (vector_shuffle (concat_vectors t1, t2), undef) // Since the latter can be efficiently lowered with VPERMD/VPERMQ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) return ShufConcat; if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } return SDValue(); } /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. 
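///
/// For example (a hypothetical instance):
///   (extractelt (pshufd (load p), <2,3,0,1>), 0)
/// reads only element 2 of the load, so rebuilding it as a VECTOR_SHUFFLE
/// lets DAGCombiner shrink it to a single scalar load at offset 8.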
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); if (!CurrentVT.isVector() || CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, ShuffleMask); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. 
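// As an illustrative instance: a v16i1 produced by a v16i8 compare is
// sign-extended back to v16i8, where every lane is 0x00 or 0xFF, and a single
// PMOVMSKB then collects the 16 sign bits into an i16.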
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, const X86Subtarget &Subtarget) { EVT VT = BitCast.getValueType(); SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && N0->getOpcode() == ISD::OR) { SDValue Op0 = N0->getOperand(0); SDValue Op1 = N0->getOperand(1); MVT TrunckVT; MVT BitcastVT; switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v16i1: TrunckVT = MVT::i8; BitcastVT = MVT::v8i1; break; case MVT::v32i1: TrunckVT = MVT::i16; BitcastVT = MVT::v16i1; break; case MVT::v64i1: TrunckVT = MVT::i32; BitcastVT = MVT::v32i1; break; } bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; bool isArg0UndefLeft = Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; bool isArg1UndefLeft = Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; SDValue OpLeft; SDValue OpRight; if (isArg0UndefRight && isArg1UndefLeft) { OpLeft = Op0; OpRight = Op1; } else if (isArg1UndefRight && isArg0UndefLeft) { OpLeft = Op1; OpRight = Op0; } else return SDValue(); SDLoc DL(BitCast); SDValue Shr = OpLeft->getOperand(0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); } if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. if (Subtarget.hasAVX512() || !Subtarget.hasSSE2()) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and // v8f64. So all legal 128-bit and 256-bit vectors are covered except for // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we // avoid sign-extending to this type entirely. // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE; switch (VecVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: SExtVT = MVT::v2i64; FPCastVT = MVT::v2f64; break; case MVT::v4i1: SExtVT = MVT::v4i32; FPCastVT = MVT::v4f32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && N0->getOperand(0).getValueType().is256BitVector()) { SExtVT = MVT::v4i64; FPCastVT = MVT::v4f64; } break; case MVT::v8i1: SExtVT = MVT::v8i16; // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), // sign-extend to a 256-bit operation to match the compare. // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. 
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && (N0->getOperand(0).getValueType().is256BitVector() || N0->getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; FPCastVT = MVT::v8f32; } break; case MVT::v16i1: SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: SExtVT = MVT::v32i8; break; }; SDLoc DL(BitCast); SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) { // Handle pre-AVX2 cases by splitting to two v16i1's. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32); SDValue Lo = extract128BitVector(V, 0, DAG, DL); SDValue Hi = extract128BitVector(V, 16, DAG, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, DAG.getConstant(16, DL, ShiftTy)); V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); return DAG.getZExtOrTrunc(V, DL, VT); } if (SExtVT == MVT::v8i16) { assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector"); V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, DAG.getUNDEF(MVT::v8i16)); } else assert(SExtVT.getScalarType() != MVT::i16 && "Vectors of i16 must be packed"); if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) V = DAG.getBitcast(FPCastVT, V); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); return DAG.getZExtOrTrunc(V, DL, VT); } static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasVLX()) { SDLoc dl(N); N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, DAG.getIntPtrConstant(0, dl)); } // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasVLX()) { SDLoc dl(N); unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } } // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. // Detect bitcasts between i32 to x86mmx low word. 
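  // For example (illustrative): (x86mmx (bitcast (v2i32 build_vector X, 0)))
  // becomes (MMX_MOVW2D X), moving the scalar straight into the low word.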
if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType() == MVT::i32) return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); } // Detect bitcasts between element or subvector extraction to x86mmx. if (VT == MVT::x86mmx && (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0->getOperand(0); if (N00.getValueType().is128BitVector()) return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, DAG.getBitcast(MVT::v2i64, N00)); } // Detect bitcasts from FP_TO_SINT to x86mmx. if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { SDLoc DL(N0); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, DAG.getBitcast(MVT::v2i64, Res)); } // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the // constant in an integer register and transferring it to an SSE register or // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64))) return SDValue(); SDValue LogicOp0 = N0.getOperand(0); SDValue LogicOp1 = N0.getOperand(1); SDLoc DL0(N0); // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && !isa(LogicOp0.getOperand(0))) { SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); } // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && !isa(LogicOp1.getOperand(0))) { SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } // Match a binop + shuffle pyramid that represents a horizontal reduction over // the elements of a vector. // Returns the vector that is being reduced on, or SDValue() if a reduction // was not matched. static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp, ArrayRef CandidateBinOps) { // The pattern must end in an extract from index 0. if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) || !isNullConstant(Extract->getOperand(1))) return SDValue(); SDValue Op = Extract->getOperand(0); unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements()); // Match against one of the candidate binary ops. if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) { return Op.getOpcode() == unsigned(BinOp); })) return SDValue(); // At each stage, we're looking for something that looks like: // %s = shufflevector <8 x i32> %op, <8 x i32> undef, // <8 x i32> // %a = binop <8 x i32> %op, %s // Where the mask changes according to the stage. E.g. 
  // for a 3-stage pyramid, we expect something like:
  //   <4,5,6,7,u,u,u,u>
  //   <2,3,u,u,u,u,u,u>
  //   <1,u,u,u,u,u,u,u>
  unsigned CandidateBinOp = Op.getOpcode();
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != CandidateBinOp)
      return SDValue();

    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
    if (Shuffle) {
      Op = Op.getOperand(1);
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || Shuffle->getOperand(0) != Op)
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  BinOp = CandidateBinOp;
  return Op;
}

// Given a select, detect the following pattern:
// 1:    %2 = zext <N x i8> %0 to <N x i32>
// 2:    %3 = zext <N x i8> %1 to <N x i32>
// 3:    %4 = sub nsw <N x i32> %2, %3
// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is
  // the difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1
  // or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal.isOneValue()) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1
  // or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}

// Given two zexts of <N x i8> to <N x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {
  // Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType(); unsigned RegSize = std::max(128u, InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, InVT)); Ops[0] = Zext0.getOperand(0); MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = Zext1.getOperand(0); SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the SAD MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. unsigned BinOp; SDValue Src = matchBinOpReduction( Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); if (!Src) return SDValue(); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); SDValue MinPos = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { unsigned NumElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = NumElts / 2; SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); unsigned SubSizeInBits = SrcVT.getSizeInBits(); SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the // v16i8 UMIN will leave the upper element as zero, performing zero-extension // ready for the PHMINPOS. 
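  // As an illustrative instance: pairing byte 2k with byte 2k+1 and a zero
  // gives v8i16 lanes of the form zext(min(b[2k], b[2k+1])), so one
  // PHMINPOSUW over those lanes yields the overall byte minimum.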
if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } // Perform the PHMINPOS on a v8i16 vector, MinPos = DAG.getBitcast(MVT::v8i16, MinPos); MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, DAG.getIntPtrConstant(0, DL)); } // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2 or with AVX512VL (which uses predicate registers). if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. unsigned BinOp = 0; SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); if (!Match) return SDValue(); // EXTRACT_VECTOR_ELT can require implicit extension of the vector element // which we can't support here for now. if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); // We require AVX2 for PMOVMSKB for v16i16/v32i8; unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) return SDValue(); // Don't bother performing this for 2-element vectors. if (Match.getValueType().getVectorNumElements() <= 2) return SDValue(); // Check that we are extracting a reduction of all sign bits. if (DAG.ComputeNumSignBits(Match) != BitWidth) return SDValue(); // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. MVT MaskVT; if (64 == BitWidth || 32 == BitWidth) MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), MatchSizeInBits / BitWidth); else MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); APInt CompareBits; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 CompareBits = APInt::getNullValue(32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); CondCode = ISD::CondCode::SETEQ; } // Perform the select as i32/i64 and then truncate to avoid partial register // stalls. unsigned ResWidth = std::max(BitWidth, 32u); EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); SDLoc DL(Extract); SDValue Zero = DAG.getConstant(0, DL, ResVT); SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); SDValue Res = DAG.getBitcast(MaskVT, Match); Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), Ones, Zero, CondCode); return DAG.getSExtOrTrunc(Res, DL, ExtractVT); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); // Verify the type we're extracting from is any integer type above i16. 
EVT VT = Extract->getOperand(0).getValueType(); if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. unsigned BinOp = 0; SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). // In order to convert to i64 and above, additional any/zero/sign // extend is expected. // The zero extend from 32 bit has no mathematical effect on the result. // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || Root.getOpcode() == ISD::ZERO_EXTEND || Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. if (!Root || (Root.getOpcode() != ISD::VSELECT)) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL); // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); MVT SadVT = SAD.getSimpleValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); for(unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); } } MVT Type = Extract->getSimpleValueType(0); unsigned TypeSizeInBits = Type.getSizeInBits(); // Return the lowest TypeSizeInBits bits. MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } // Attempt to peek through a target shuffle and extract the scalar from the // source. static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. if (X86ISD::VBROADCAST == Src.getOpcode() && Src.getOperand(0).getValueType() == VT) return Src.getOperand(0); // Resolve the target shuffle inputs and mask. 
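  // For example (illustrative): (extractelt (pshufd X, <1,0,3,2>), 2)
  // resolves through the mask to (extractelt X, 3).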
SmallVector Mask; SmallVector Ops; if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); scaleShuffleMask(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { SmallVector WidenedMask; while (Mask.size() > NumSrcElts && canWidenShuffleElements(Mask, WidenedMask)) Mask = std::move(WidenedMask); // TODO - investigate support for wider shuffle masks with known upper // undef/zero elements for implicit zero-extension. } } // Check if narrowing/widening failed. if (Mask.size() != NumSrcElts) return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) return DAG.getUNDEF(VT); if (SrcIdx == SM_SentinelZero) return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; SrcOp = DAG.getBitcast(SrcVT, SrcOp); SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain // circumstances, depending on SSE-level. // TODO: Investigate using extract_subvector for larger vectors. // TODO: Investigate float/double extraction if it will be just stored. if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); } return SDValue(); } /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading /// scalars back, while for x64 we should use 64-bit extracts and shifts. static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); SDValue EltIdx = N->getOperand(1); EVT SrcVT = InputVector.getValueType(); EVT VT = N->getValueType(0); SDLoc dl(InputVector); // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. 
if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getBitcast(VT, InputVector); } // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && isa(EltIdx) && isa(InputVector.getOperand(0))) { uint64_t ExtractedElt = N->getConstantOperandVal(1); uint64_t InputValue = InputVector.getConstantOperandVal(0); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } // Check whether this extract is the root of a sum of absolute differences // pattern. This has to be done here because we really want it to happen // pre-legalization, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (SrcVT != MVT::v4i32) return SDValue(); // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a // single use which is a sign-extend or zero-extend, and all elements are // used. SmallVector Uses; unsigned ExtractedElements = 0; for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() != InputVector.getResNo()) return SDValue(); SDNode *Extract = *UI; if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (Extract->getValueType(0) != MVT::i32) return SDValue(); if (!Extract->hasOneUse()) return SDValue(); if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); if (!isa(Extract->getOperand(1))) return SDValue(); // Record which element was extracted. ExtractedElements |= 1 << Extract->getConstantOperandVal(1); Uses.push_back(Extract); } // If not all the elements were used, this may not be worthwhile. if (ExtractedElements != 15) return SDValue(); // Ok, we've now decided to do the transformation. // If 64-bit shifts are legal, use the extract-shift sequence, // otherwise bounce the vector off the cache. 
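  // For example (illustrative): the v4i32 becomes two i64 element extracts,
  // and element 1 is recovered as (trunc (sra bottom64, 32)), so four vector
  // extracts collapse into two plus scalar shifts.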
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Vals[4]; if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) { SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector); auto &DL = DAG.getDataLayout(); EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL); SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(0, dl, VecIdxTy)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst, DAG.getConstant(1, dl, VecIdxTy)); SDValue ShAmt = DAG.getConstant( 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL)); Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt)); Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf); Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt)); } else { // Store the value to a temporary stack slot. SDValue StackPtr = DAG.CreateStackTemporary(SrcVT); SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, MachinePointerInfo()); EVT ElementType = SrcVT.getVectorElementType(); unsigned EltSize = ElementType.getSizeInBits() / 8; // Replace each use (extract) with a load of the appropriate element. for (unsigned i = 0; i < 4; ++i) { uint64_t Offset = EltSize * i; auto PtrVT = TLI.getPointerTy(DAG.getDataLayout()); SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT); SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal); // Load the scalar. Vals[i] = DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo()); } } // Replace the extracts for (SmallVectorImpl::iterator UI = Uses.begin(), UE = Uses.end(); UI != UE; ++UI) { SDNode *Extract = *UI; uint64_t IdxVal = Extract->getConstantOperandVal(1); DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]); } // The replacement was made in place; return N so it won't be revisited. return SDValue(N, 0); } /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() != ISD::VSELECT) return SDValue(); assert(CondVT.isVector() && "Vector select expects a vector selector!"); bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getAllOnesConstant(DL, CondVT)); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must // have already been promoted from the IR select condition type . 
// Don't check if the types themselves are equal because that excludes // vector floating-point selects. if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try to invert the condition if true value is not all 1s and false value is // not all 0s. if (!TValIsAllOnes && !FValIsAllZeros && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } // Cond value must be 'sign splat' to be converted to a logical op. if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) return SDValue(); // vselect Cond, 111..., 000... -> Cond if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) return SDValue(); // vselect Cond, 111..., X -> or Cond, X if (TValIsAllOnes) { SDValue CastRHS = DAG.getBitcast(CondVT, RHS); SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); return DAG.getBitcast(VT, Or); } // vselect Cond, X, 000... -> and Cond, X if (FValIsAllZeros) { SDValue CastLHS = DAG.getBitcast(CondVT, LHS); SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); return DAG.getBitcast(VT, And); } // vselect Cond, 000..., X -> andn Cond, X if (TValIsAllZeros) { MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); SDValue CastCond = DAG.getBitcast(AndNVT, Cond); SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); return DAG.getBitcast(VT, AndN); } return SDValue(); } static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); SDLoc DL(N); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. 
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); SDLoc DL(N); auto *TrueC = dyn_cast<ConstantSDNode>(LHS); auto *FalseC = dyn_cast<ConstantSDNode>(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. The 'not' // of the condition can usually be folded into a compare predicate, but even // without that, the sequence should be cheaper than a CMOV alternative. if (TrueVal.slt(FalseVal)) { Cond = DAG.getNOT(DL, Cond, MVT::i1); std::swap(TrueC, FalseC); } // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); // Multiply condition by the difference if non-one. if (!AbsDiff.isOneValue()) R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); // Add the base if non-zero. if (!FalseC->isNullValue()) R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); return R; } return SDValue(); } // If this is a bitcasted op that can be represented as another type, push // the bitcast to the inputs. This allows more opportunities for pattern // matching masked instructions. This is called when we know that the operation // is used as one of the inputs of a vselect. static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // Make sure we have a bitcast. if (OrigOp.getOpcode() != ISD::BITCAST) return false; SDValue Op = OrigOp.getOperand(0); // If the operation is used by anything other than the bitcast, we shouldn't // do this combine as that would replicate the operation. if (!Op.hasOneUse()) return false; MVT VT = OrigOp.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); SDLoc DL(Op.getNode()); auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1, SDValue Op2) { Op0 = DAG.getBitcast(VT, Op0); DCI.AddToWorklist(Op0.getNode()); Op1 = DAG.getBitcast(VT, Op1); DCI.AddToWorklist(Op1.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2)); return true; }; unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SHUF128: { if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) return false; // Only change element size, not type. if (VT.isInteger() != Op.getSimpleValueType().isInteger()) return false; return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } case X86ISD::SUBV_BROADCAST: { unsigned EltSize = EltVT.getSizeInBits(); if (EltSize != 32 && EltSize != 64) return false; // Only change element size, not type. if (VT.isInteger() != Op.getSimpleValueType().isInteger()) return false; SDValue Op0 = Op.getOperand(0); MVT Op0VT = MVT::getVectorVT(EltVT, Op0.getSimpleValueType().getSizeInBits() / EltSize); Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0)); DCI.AddToWorklist(Op0.getNode()); DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, Op0)); return true; } } return false; } /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); // Get the LHS/RHS of the select. SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x<y?x:y but not // x<=y?x:y, because of how they handle negative zero (which can be // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && VT != MVT::f80 && VT != MVT::f128 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget.hasSSE2() || (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y.
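The x<y?x:y caveat just above is easy to see in scalar code: with signed zeros the C idiom is not symmetric in its operands, and a NaN operand makes every ordered comparison false so the idiom always yields its second operand. That is exactly why the switch that follows only swaps LHS/RHS when NaNs and zeros are provably absent (or unsafe-math is enabled). A small illustrative sketch, not part of this change:

#include <cassert>
#include <cmath>

int main() {
  double pz = +0.0, nz = -0.0;
  // +0.0 and -0.0 compare equal, so "x < y" is false either way and the
  // idiom returns its second operand: operand order picks the zero's sign.
  double a = (pz < nz) ? pz : nz; // yields -0.0
  double b = (nz < pz) ? nz : pz; // yields +0.0
  assert(std::signbit(a) && !std::signbit(b));
  // Any ordered comparison with NaN is false, so the idiom again yields the
  // second operand, which is why a blind operand swap mishandles NaNs.
  double n = std::nan("");
  assert(std::isnan((1.0 < n) ? 1.0 : n));
  assert(!std::isnan((n < 1.0) ? n : 1.0));
  return 0;
}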
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. 
if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation for all 128 and 256-bit vectors of i8 and i16. // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && (VT.is128BitVector() || VT.is256BitVector()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && !(Subtarget.hasBWI() && Subtarget.hasVLX())) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); DCI.AddToWorklist(Cond.getNode()); return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, %edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS)) if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS)) if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode()) // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && CondRHSConst->getAPIntValue() == (-OpRHSConst->getAPIntValue() - 1)) return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT)); // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignMask()) // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. return DAG.getNode( X86ISD::SUBUS, DL, VT, OpLHS, DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT)); } } }
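For i8/i16 lanes the subus patterns above can be verified exhaustively. A standalone sketch of the identities being matched, including the xor'd sign-bit special case — illustrative only; subus is a made-up stand-in for one PSUBUSB lane:

#include <cassert>
#include <cstdint>

// One lane of PSUBUSB: subtract with unsigned saturation.
static uint8_t subus(uint8_t x, uint8_t y) {
  return (uint8_t)(x > y ? x - y : 0);
}

int main() {
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned y = 0; y < 256; ++y) {
      // x >= y ? x-y : 0 and x > y ? x-y : 0 agree (x == y gives 0 anyway).
      assert((uint8_t)(x >= y ? x - y : 0) == subus((uint8_t)x, (uint8_t)y));
      assert((uint8_t)(x > y ? x - y : 0) == subus((uint8_t)x, (uint8_t)y));
    }
    // x s< 0 ? x^0x80 : 0 --> subus x, 0x80: xor by the sign mask is the
    // canonicalized form of subtracting it.
    uint8_t viaXor = (uint8_t)((int8_t)x < 0 ? (x ^ 0x80u) : 0u);
    assert(viaXor == subus((uint8_t)x, 0x80));
  }
  return 0;
}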
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; // If this is a *dynamic* select (non-constant condition) and we can match // this node with one of the variable blend instructions, restructure the // condition so that blends can use the high (sign) bit of each element and // use SimplifyDemandedBits to simplify the condition operand. if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { unsigned BitWidth = Cond.getScalarValueSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) return SDValue(); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and // this makes it hard to see whether a dynamic VSELECT will correctly // lower, so we both check the operation's status and explicitly handle the // cases where a *dynamic* blend will fail even though a constant-condition // blend could be custom lowered. // FIXME: We should find a better way to handle this class of problems. // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. if (VT.is128BitVector() && !Subtarget.hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); // There are no 512-bit blend instructions that use sign bits.
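Why SimplifyDemandedBits may demand only the sign bit in the block that follows: variable blends (BLENDVPS/PBLENDVB, and the SHRUNKBLEND node created below) select each lane solely by the condition lane's top bit. Lane-wise that is an arithmetic-shift splat followed by bit logic, as in this illustrative sketch (blendLane is a made-up name; assumes arithmetic >> on signed int):

#include <cassert>
#include <cstdint>

// One lane of a variable blend: only the sign bit of 'cond' matters.
static uint32_t blendLane(uint32_t cond, uint32_t t, uint32_t f) {
  uint32_t mask = (uint32_t)((int32_t)cond >> 31); // splat the sign bit
  return (t & mask) | (f & ~mask);
}

int main() {
  for (uint32_t cond : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu}) {
    uint32_t r = blendLane(cond, 0xAAAAAAAAu, 0x55555555u);
    // Equivalent to "sign bit set ? t : f": the low 31 bits are ignored.
    assert(r == ((cond & 0x80000000u) ? 0xAAAAAAAAu : 0x55555555u));
  }
  return 0;
}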
if (VT.is512BitVector()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); APInt DemandedMask(APInt::getSignMask(BitWidth)); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) || TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) { // If we changed the computation somewhere in the DAG, this change will // affect all users of Cond. Make sure it is fine and update all the nodes // so that we do not use the generic VSELECT anymore. Otherwise, we may // perform wrong optimizations as we messed with the actual expectation // for the vector boolean values. if (Cond != TLO.Old) { // Check all uses of the condition operand to check whether it will be // consumed by non-BLEND instructions. Those may require that all bits // are set properly. - for (SDNode *U : Cond->uses()) { + for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); + UI != UE; ++UI) { // TODO: Add other opcodes eventually lowered into BLEND. - if (U->getOpcode() != ISD::VSELECT) + if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0) return SDValue(); } // Update all users of the condition before committing the change, so // that the VSELECT optimizations that expect the correct vector boolean // value will not be triggered. for (SDNode *U : Cond->uses()) { SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0), Cond, U->getOperand(1), U->getOperand(2)); DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); } DCI.CommitTargetLoweringOpt(TLO); return SDValue(); } // Only Cond (rather than other nodes in the computation chain) was // changed. Change the condition just for N to keep the opportunity to // optimize all other users their own way. SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB); return SDValue(); } } // Look for vselects with LHS/RHS being bitcasted from an operation that // can be executed on another type. Push the bitcast to the inputs of // the operation. This exposes opportunities for using masking instructions. if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() && CondVT.getVectorElementType() == MVT::i1) { if (combineBitcastForMaskedOp(LHS, DAG, DCI)) return SDValue(N, 0); if (combineBitcastForMaskedOp(RHS, DAG, DCI)) return SDValue(N, 0); } // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); RHS = DAG.getBitcast(MVT::i64, RHS); SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); return DAG.getBitcast(VT, newSelect); } return SDValue(); } /// Combine: /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) /// to: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Can't replace the cmp if it has more uses than the one we're looking at. // FIXME: We would like to be able to handle this, but would need to make sure // all uses were updated. 
if (!Cmp.hasOneUse()) return SDValue(); // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) // Using the proper condcodes (see below), overflow is checked for. // FIXME: We can generalize both constraints: // - XOR/OR/AND (if they were made to survive AtomicExpand) // - LHS != 1 // if the result is compared. SDValue CmpLHS = Cmp.getOperand(0); SDValue CmpRHS = Cmp.getOperand(1); if (!CmpLHS.hasOneUse()) return SDValue(); unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); SDValue OpRHS = CmpLHS.getOperand(2); auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS); if (!OpRHSC) return SDValue(); APInt Addend = OpRHSC->getAPIntValue(); if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); if (!CmpRHSC) return SDValue(); APInt Comparison = CmpRHSC->getAPIntValue(); // If the addend is the negation of the comparison value, then we can do // a full comparison by emitting the atomic arithmetic as a locked sub. if (Comparison == -Addend) { // The CC is fine, but we need to rewrite the LHS of the comparison as an // atomic sub. auto *AN = cast<AtomicSDNode>(CmpLHS.getNode()); auto AtomicSub = DAG.getAtomic( ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(), /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()), AN->getMemOperand()); // If the comparison uses the CF flag we can't use INC/DEC instructions. bool NeedCF = false; switch (CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; } auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // We can handle comparisons with zero in a number of cases by manipulating // the CC used. if (!Comparison.isNullValue()) return SDValue(); if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) CC = X86::COND_G; else if (CC == X86::COND_G && Addend == -1) CC = X86::COND_GE; else if (CC == X86::COND_LE && Addend == -1) CC = X86::COND_L; else return SDValue(); SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; }
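The CC rewrites above are subtler than they look: "(icmp slt x, 0) -> (icmp sle (add x, 1), 0)" is wrong as plain wrapping arithmetic (consider x == INT_MAX), but COND_LE/COND_G read OF as well as SF/ZF from the LOCKed add, which accounts for the overflow. An illustrative sketch modeling the flags of an 8-bit increment, checked exhaustively (assumes two's-complement wrapping; addOneFlags is a made-up name):

#include <cassert>
#include <cstdint>

struct Flags { bool ZF, SF, OF; };

// x86-style flags of the 8-bit operation t = x + 1 (e.g. LOCK INC).
static Flags addOneFlags(int8_t x, int8_t &t) {
  t = (int8_t)(uint8_t)((uint8_t)x + 1u); // wrapping 8-bit add
  return Flags{t == 0, t < 0, x == INT8_MAX};
}

int main() {
  for (int v = INT8_MIN; v <= INT8_MAX; ++v) {
    int8_t x = (int8_t)v, t;
    Flags f = addOneFlags(x, t);
    // (icmp slt x, 0) == COND_LE on flags(add x, 1): ZF || (SF != OF).
    assert((x < 0) == (f.ZF || (f.SF != f.OF)));
    // (icmp sge x, 0) == COND_G on flags(add x, 1): !ZF && (SF == OF).
    assert((x >= 0) == (!f.ZF && (f.SF == f.OF)));
  }
  return 0;
}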
// Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // a SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast<ConstantSDNode>(Op1))) SetCC = Op2; else if ((C = dyn_cast<ConstantSDNode>(Op2))) SetCC = Op1; else // Quit if neither operand is a constant. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 nor 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); LLVM_FALLTHROUGH; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; LLVM_FALLTHROUGH; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. static SDValue combineCarryThroughADD(SDValue EFLAGS) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || Carry.getOpcode() == ISD::SIGN_EXTEND || Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return Carry.getOperand(1); } } } return SDValue(); } /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) if (SDValue Flags = combineCarryThroughADD(EFLAGS)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); if (CC == X86::COND_E || CC == X86::COND_NE) { switch (Cond.getOpcode()) { default: break; case X86ISD::BSR: case X86ISD::BSF: // If operand of BSR / BSF are proven never zero, then ZF cannot be set. if (DAG.isKnownNeverZero(Cond.getOperand(0))) return (CC == X86::COND_E) ? FalseOp : TrueOp; } } // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. 
if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } // Optimize cases that will turn into an LEA instruction. This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } }
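All three constant-pair folds above reduce the CMOV to arithmetic on the zero-extended condition bit — the same identity combineSelectOfTwoConstants used earlier for plain selects. A scalar sketch of the shapes involved, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  for (bool cond : {false, true}) {
    uint32_t c = cond ? 1u : 0u; // zext(setcc(CC))
    // C ? 8 : 0 -> zext(setcc(C)) << 3 (any pow2/0 pair, shift by log2).
    assert((cond ? 8u : 0u) == (c << 3));
    // Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
    for (uint32_t cst : {0u, 41u, 0xfffffffeu})
      assert((cond ? cst + 1 : cst) == (c + cst));
    // LEA-friendly differences (1,2,3,4,5,8,9): scale, then add the base.
    for (uint32_t base : {0u, 7u})
      for (uint32_t diff : {1u, 2u, 3u, 4u, 5u, 8u, 9u})
        assert((cond ? base + diff : base) == (c * diff + base));
  }
  return 0;
}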
// Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && !isa<ConstantSDNode>(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } } // Fold and/or of setcc's to double CMOV: // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) // // This combine lets us generate: // cmovcc1 (jcc1 if we don't have CMOV) // cmovcc2 (same) // instead of: // setcc1 // setcc2 // and/or // cmovne (jne if we don't have CMOV) // When we can't use the CMOV instruction, it might increase branch // mispredicts. // When we can use CMOV, or when there is no mispredict, this improves // throughput and reduces register pressure. // if (CC == X86::COND_NE) { SDValue Flags; X86::CondCode CC0, CC1; bool isAndSetCC; if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { if (isAndSetCC) { std::swap(FalseOp, TrueOp); CC0 = X86::GetOppositeBranchCondition(CC0); CC1 = X86::GetOppositeBranchCondition(CC1); } SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } } return SDValue(); } /// Different mul shrinking modes. enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); if (VT.getScalarSizeInBits() != 32) return false; assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); unsigned SignBits[2] = {1, 1}; bool IsPositive[2] = {false, false}; for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to // compute signbits for it separately. if (Opd.getOpcode() == ISD::ANY_EXTEND) { // For anyextend, it is safe to assume an appropriate number of leading // sign/zero bits. if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8) SignBits[i] = 25; else if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i16) SignBits[i] = 17; else return false; IsPositive[i] = true; } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) { // All the operands of BUILD_VECTOR need to be int constant. // Find the smallest value range which all the operands belong to. SignBits[i] = 32; IsPositive[i] = true; for (const SDValue &SubOp : Opd.getNode()->op_values()) { if (SubOp.isUndef()) continue; auto *CN = dyn_cast<ConstantSDNode>(SubOp); if (!CN) return false; APInt IntVal = CN->getAPIntValue(); if (IntVal.isNegative()) IsPositive[i] = false; SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits()); } } else { SignBits[i] = DAG.ComputeNumSignBits(Opd); if (Opd.getOpcode() == ISD::ZERO_EXTEND) IsPositive[i] = true; } } bool AllPositive = IsPositive[0] && IsPositive[1]; unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) Mode = MULS8; // When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24) Mode = MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) Mode = MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) Mode = MULU16; else return false; return true; } /// When the operands of vector mul are extended from smaller size values, /// like i8 and i16, the type of mul may be shrunk to generate more /// efficient code. Two typical patterns are handled: /// Pattern1: /// %2 = sext/zext <N x i8> %1 to <N x i32> /// %4 = sext/zext <N x i8> %3 to <N x i32> // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul <N x i32> %2, %4 /// /// Pattern2: /// %2 = zext/sext <N x i16> %1 to <N x i32> /// %4 = zext/sext <N x i16> %3 to <N x i32> /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul <N x i32> %2, %4 /// /// There are four mul shrinking modes: /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is /// -128 to 127, and the scalar value range of %4 is also -128 to 127, /// generate pmullw+sext32 for it (MULS8 mode). /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is /// 0 to 255, and the scalar value range of %4 is also 0 to 255, /// generate pmullw+zext32 for it (MULU8 mode). /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, /// generate pmullw+pmulhw for it (MULS16 mode). /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Check for legality // pmullw/pmulhw are not supported by SSE. if (!Subtarget.hasSSE2()) return SDValue(); // Check for profitability // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(N, DAG, Mode)) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); if ((NumElts % 2) != 0) return SDValue(); // If the upper 17 bits of each element are zero then we can use PMADD. APInt Mask17 = APInt::getHighBitsSet(32, 17); if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) && DAG.MaskedValueIsZero(N1, Mask17)) return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0), DAG.getBitcast(MVT::v8i16, N1)); unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); if (NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); if (Mode == MULU8 || Mode == MULS8) { return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); } else { MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. SmallVector<int, 16> ShuffleMask(NumElts); for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i + NumElts / 2; ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } } else { // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want // to legalize the mul explicitly because implicit legalization for type // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack // instructions which will not exist when we explicitly legalize it by // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with // <4 x i16> undef). // // Legalize the operands of mul. // FIXME: We may be able to handle non-concatenated vectors by insertion. unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); if ((RegSize % ReducedSizeInBits) != 0) return SDValue(); SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits, DAG.getUNDEF(ReducedVT)); Ops[0] = NewN0; NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); Ops[0] = NewN1; NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); if (Mode == MULU8 || Mode == MULS8) { // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower // part is needed. SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); // convert the type of mul result to VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::SIGN_EXTEND_VECTOR_INREG, DL, ResVT, Mul); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } else { // Generate the lower and higher part of mul: pmulhw/pmulhuw. For // MULU16/MULS16, both parts are needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. Make sure the type of mul result is VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); Res = DAG.getBitcast(ResVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } } }
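The pmullw/pmulhw repacking in reduceVMULWidth reconstructs each 32-bit product from its low and high 16-bit halves (the interleave is what punpcklwd/punpckhwd perform lane-wise; MULU16 uses pmulhuw the same way). The underlying scalar identity, sketched for MULS16 — illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  for (int a = -32768; a <= 32767; a += 257) {
    for (int b = -32768; b <= 32767; b += 263) {
      int32_t wide = (int32_t)a * (int32_t)b;         // the original i32 mul
      uint16_t lo = (uint16_t)wide;                   // pmullw lane
      uint16_t hi = (uint16_t)((uint32_t)wide >> 16); // pmulhw lane
      // Interleaving hi:lo rebuilds the 32-bit product exactly.
      assert(wide == (int32_t)(((uint32_t)hi << 16) | lo));
    }
  }
  return 0;
}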
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, SDLoc DL) { auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mult, DL, VT)); Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(Shift, DL, MVT::i8)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; auto combineMulMulAddOrSub = [&](bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(9, DL, VT)); Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; switch (MulAmt) { default: break; case 11: // mul x, 11 => add ((shl (mul x, 5), 1), x) return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); case 21: // mul x, 21 => add ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); case 22: // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); case 19: // mul x, 19 => sub ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ false); case 13: // mul x, 13 => add ((shl (mul x, 3), 2), x) return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); case 23: // mul x, 23 => sub ((shl (mul x, 3), 3), x) return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); case 14: // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(3, 2, /*isAdd*/ true)); case 26: // mul x, 26 => sub ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(/*isAdd*/ false); case 28: // mul x, 28 => add ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(/*isAdd*/ true); case 29: // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulMulAddOrSub(/*isAdd*/ true)); case 30: // mul x, 30 => sub (sub ((shl x, 5), x), x) return DAG.getNode( ISD::SUB, DL, VT, DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(5, DL, MVT::i8)), N->getOperand(0)), N->getOperand(0)); } return SDValue(); }
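Each special case in combineMulSpecial is a small algebraic identity over the target's cheap operations (shifts, adds, and the 3/5/9 LEA multipliers); combineMul below applies the generic 9/5/3 factorization the same way. A quick illustrative check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t x = 0; x < 1000; ++x) {
    assert(x * 11 == ((x * 5) << 1) + x);  // mul x, 11
    assert(x * 21 == ((x * 5) << 2) + x);  // mul x, 21
    assert(x * 19 == ((x * 5) << 2) - x);  // mul x, 19
    assert(x * 23 == ((x * 3) << 3) - x);  // mul x, 23
    assert(x * 26 == (x * 9) * 3 - x);     // mul x, 26
    assert(x * 30 == ((x << 5) - x) - x);  // mul x, 30
    // The generic 9/5/3 decomposition in combineMul, e.g. 45 = 9 * 5:
    assert(x * 45 == (x * 9) * 5);
  }
  return 0;
}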
/// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); if (!C) return SDValue(); uint64_t MulAmt = C->getZExtValue(); if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) return SDValue(); uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((MulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = MulAmt / 9; } else if ((MulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = MulAmt / 5; } else if ((MulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = MulAmt / 3; } SDLoc DL(N); SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If second multiplier is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); } else if (!Subtarget.slowLEA()) NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL); if (!NewMul) { assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); int64_t SignMulAmt = C->getSExtValue(); if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) && (SignMulAmt != -INT64_MAX)) { int NumSign = SignMulAmt > 0 ? 1 : -1; bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1); bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1); if (IsPowerOf2_64PlusOne) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL, MVT::i8))); } else if (IsPowerOf2_64MinusOne) { // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode( ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL, MVT::i8)), N->getOperand(0)); } // To negate, subtract the number from zero if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } } if (NewMul) // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, NewMul, false); return SDValue(); } static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended.
// Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) { SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) // depending on sign of (SarConst - [56,48,32,24,16]) // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); EVT CVT = N1.getValueType(); if (SarConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; else if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); else return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); }
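The (ashr (shl x, K), SarConst) rewrite above replaces the shift pair with a sign extension plus at most one shift. For the i8-in-i32 case (K = 24) the identity looks like this — illustrative only, and assuming arithmetic >> on signed int (guaranteed since C++20, universal in practice); sext8 is a made-up helper:

#include <cassert>
#include <cstdint>

// sext_inreg from i8 at i32: what the MOVSX-style sign extension produces.
static int32_t sext8(int32_t x) { return (int32_t)(int8_t)(uint8_t)x; }

int main() {
  for (int32_t x = -1000; x <= 1000; ++x) {
    int32_t shl24 = (int32_t)((uint32_t)x << 24);            // (shl x, 24)
    assert((shl24 >> 24) == sext8(x));                       // SarConst == 24
    assert((shl24 >> 26) == (sext8(x) >> 2));                // SarConst > 24: sra
    assert((shl24 >> 22) == (int32_t)((uint32_t)sext8(x) << 2)); // < 24: shl
  }
  return 0;
}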
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. // TODO: This is a generic DAG combine that became an x86-only combine to // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and // and-not ('andn'). if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); auto *ShiftC = dyn_cast<ConstantSDNode>(N1); auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!ShiftC || !AndC) return SDValue(); // If we can shrink the constant mask below 8-bits or 32-bits, then this // transform should reduce code size. It may also enable secondary transforms // from improved known-bits analysis or instruction selection. APInt MaskVal = AndC->getAPIntValue(); APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); unsigned OldMaskSize = MaskVal.getMinSignedBits(); unsigned NewMaskSize = NewMaskVal.getMinSignedBits(); if ((OldMaskSize > 8 && NewMaskSize <= 8) || (OldMaskSize > 32 && NewMaskSize <= 32)) { // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) SDLoc DL(N); SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); } return SDValue(); } static SDValue combineShift(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (N->getOpcode() == ISD::SHL) if (SDValue V = combineShiftLeft(N, DAG)) return V; if (N->getOpcode() == ISD::SRA) if (SDValue V = combineShiftRightArithmetic(N, DAG)) return V; if (N->getOpcode() == ISD::SRL) if (SDValue V = combineShiftRightLogical(N, DAG)) return V; return SDValue(); } static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector<APInt, 32> EltBits0, EltBits1; if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; bool IsSigned = (X86ISD::PACKSS == Opcode); APInt Undefs(NumDstElts, 0); SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); if (UndefElts[SrcIdx]) { Undefs.setBit(Lane * NumDstEltsPerLane + Elt); continue; } APInt &Val = EltBits[SrcIdx]; if (IsSigned) { // PACKSS: Truncate signed value with signed saturation. // Source values less than dst minint are saturated to minint. // Source values greater than dst maxint are saturated to maxint.
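The saturation rule just described is clamp-then-truncate per lane. An illustrative standalone sketch for the i16-to-i8 case (packss/packus are made-up stand-ins for one PACKSSWB/PACKUSWB lane):

#include <cassert>
#include <cstdint>
#include <algorithm>

// One lane of PACKSSWB: i16 -> i8 with signed saturation.
static int8_t packss(int16_t v) {
  return (int8_t)std::min<int16_t>(std::max<int16_t>(v, -128), 127);
}
// One lane of PACKUSWB: i16 -> u8 with unsigned saturation (signed input).
static uint8_t packus(int16_t v) {
  return (uint8_t)std::min<int16_t>(std::max<int16_t>(v, 0), 255);
}

int main() {
  assert(packss(1000) == 127 && packss(-1000) == -128 && packss(7) == 7);
  assert(packus(1000) == 255 && packus(-1000) == 0 && packus(200) == 200);
  // Same shape as the constant folding here: truncate when the value fits,
  // otherwise clamp to the destination min/max.
  for (int v = -32768; v <= 32767; ++v) {
    int16_t x = (int16_t)v;
    int8_t s = (x >= -128 && x <= 127) ? (int8_t)x : (x < 0 ? -128 : 127);
    assert(packss(x) == s);
  }
  return 0;
}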
if (Val.isSignedIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getSignedMinValue(DstBitsPerElt); else Val = APInt::getSignedMaxValue(DstBitsPerElt); } else { // PACKUS: Truncate signed value with unsigned saturation. // Source values less than zero are saturated to zero. // Source values greater than dst maxuint are saturated to maxuint. if (Val.isIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getNullValue(DstBitsPerElt); else Val = APInt::getAllOnesValue(DstBitsPerElt); } Bits[Lane * NumDstEltsPerLane + Elt] = Val; } } return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } return SDValue(); } static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || X86ISD::VSRLI == Opcode) && "Unexpected shift opcode"); bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue(); if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) { if (LogicalShift) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); else ShiftVal = NumBitsPerElt - 1; } // Shift N0 by zero -> N0. if (!ShiftVal) return N0; // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N)); // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31). // This VSRLI only looks at the sign bit, which is unmodified by VSRAI. // TODO - support other sra opcodes as needed. if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt && N0.getOpcode() == X86ISD::VSRAI) return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI && N1 == N0.getOperand(1)) { SDValue N00 = N0.getOperand(0); unsigned NumSignBits = DAG.ComputeNumSignBits(N00); if (ShiftVal.ult(NumSignBits)) return N00; } // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI, Subtarget)) { DCI.CombineTo(N, Res); return SDValue(); } } // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (N->isOnlyUserOf(N0.getNode()) &&
      getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
    assert(EltBits.size() == VT.getVectorNumElements() &&
           "Unexpected shift value type");
    unsigned ShiftImm = ShiftVal.getZExtValue();
    for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  return SDValue();
}

static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
       (N->getOpcode() == X86ISD::PINSRW &&
        N->getValueType(0) == MVT::v8i16)) &&
      "Unexpected vector insertion");

  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  if (SDValue Res = combineX86ShufflesRecursively(
          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
    DCI.CombineTo(N, Res);
    return SDValue();
  }

  return SDValue();
}

/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
/// OR -> CMPNEQSS.
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT VT = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getConstant(x86cc, DL, MVT::i8));
            return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                               N->getSimpleValueType(0), FSetCC,
                               DAG.getIntPtrConstant(0, DL));
          }
          SDValue OnesOrZeroesF =
              DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
                          CMP01, DAG.getConstant(x86cc, DL, MVT::i8));

          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

          if (is64BitFP && !Subtarget.is64Bit()) {
            // On a 32-bit target, we cannot bitcast the 64-bit float to a
            // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
            // bits, but can do this little dance to extract the lowest 32 bits
            // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL,
                                           MVT::v2f64, OnesOrZeroesF);
            SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
            OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }

          SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
                                      DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::AND);

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();

  if (N0.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  if (N1.getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
// some of the transition sequences.
// Even with AVX-512 this is still useful for removing casts around logical
// operations on vXi1 mask types.
static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  assert(VT.isVector() && "Expected vector type");

  assert((N->getOpcode() == ISD::ANY_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND ||
          N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

  SDValue Narrow = N->getOperand(0);
  EVT NarrowVT = Narrow.getValueType();

  if (Narrow->getOpcode() != ISD::XOR &&
      Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();

  SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(Narrow);

  // The Left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();

  // The type of the truncated inputs.
  if (N0->getOperand(0).getValueType() != VT)
    return SDValue();

  // The right side has to be a 'trunc' or a constant vector.
  bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
                  N1.getOperand(0).getValueType() == VT;
  if (!RHSTrunc &&
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
    return SDValue();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
    return SDValue();

  // Set N0 and N1 to hold the inputs to the new wide operation.
  N0 = N0->getOperand(0);
  if (RHSTrunc)
    N1 = N1->getOperand(0);
  else
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

  // Generate the wide operation.
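  // E.g. (illustrative types): for zext (and (trunc X to v8i16), C) to v8i32,
  // the wide operation built here is 'and X, (zext C)' on v8i32, with the
  // extension semantics reapplied to the result below.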
  SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
  case ISD::ZERO_EXTEND:
    return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
  case ISD::SIGN_EXTEND:
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
                       Op, DAG.getValueType(NarrowVT));
  }
}

/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  unsigned FPOpcode = ISD::DELETED_NODE;
  if (N->getOpcode() == ISD::AND)
    FPOpcode = X86ISD::FAND;
  else if (N->getOpcode() == ISD::OR)
    FPOpcode = X86ISD::FOR;
  else if (N->getOpcode() == ISD::XOR)
    FPOpcode = X86ISD::FXOR;

  assert(FPOpcode != ISD::DELETED_NODE &&
         "Unexpected input node for FP logic conversion");

  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
  if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
      ((Subtarget.hasSSE1() && VT == MVT::i32) ||
       (Subtarget.hasSSE2() && VT == MVT::i64))) {
    SDValue N00 = N0.getOperand(0);
    SDValue N10 = N1.getOperand(0);
    EVT N00Type = N00.getValueType();
    EVT N10Type = N10.getValueType();
    if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
      SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}

/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
  SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
  EVT VT0 = Op0.getValueType();
  EVT VT1 = Op1.getValueType();

  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();

  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
    return SDValue();

  unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
  unsigned ShiftVal = SplatVal.countTrailingOnes();
  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
  SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
  return DAG.getBitcast(N->getValueType(0), Shift);
}

// Get the index node from the lowered DAG of a GEP IR instruction with one
// indexing dimension.
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
  if (Ld->isIndexed())
    return SDValue();

  SDValue Base = Ld->getBasePtr();

  if (Base.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue ShiftedIndex = Base.getOperand(0);

  if (ShiftedIndex.getOpcode() != ISD::SHL)
    return SDValue();

  return ShiftedIndex.getOperand(0);
}

static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
  if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
    switch (VT.getSizeInBits()) {
    default: return false;
    case 64: return Subtarget.is64Bit();
    case 32: return true;
    }
  }
  return false;
}

// This function recognizes cases where the X86 bzhi instruction can replace
// an 'and-load' sequence.
// In case of loading an integer value from an array of constants which is
// defined as follows:
//
//   int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
//
// then applying a bitwise and on the result with another input.
// It's equivalent to performing bzhi (zero high bits) on the input, with the
// same index of the load.
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = Node->getSimpleValueType(0);
  SDLoc dl(Node);

  // Check if subtarget has BZHI instruction for the node's type
  if (!hasBZHI(Subtarget, VT))
    return SDValue();

  // Try matching the pattern for both operands.
  for (unsigned i = 0; i < 2; i++) {
    SDValue N = Node->getOperand(i);
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

    // Bail out if the operand is not a load instruction.
    if (!Ld)
      return SDValue();

    const Value *MemOp = Ld->getMemOperand()->getValue();

    if (!MemOp)
      return SDValue();

    if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
      if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
        if (GV->isConstant() && GV->hasDefinitiveInitializer()) {

          Constant *Init = GV->getInitializer();
          Type *Ty = Init->getType();
          if (!isa<ConstantDataArray>(Init) ||
              !Ty->getArrayElementType()->isIntegerTy() ||
              Ty->getArrayElementType()->getScalarSizeInBits() !=
                  VT.getSizeInBits() ||
              Ty->getArrayNumElements() >
                  Ty->getArrayElementType()->getScalarSizeInBits())
            continue;

          // Check if the array's constant elements are suitable to our case.
          uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
          bool ConstantsMatch = true;
          for (uint64_t j = 0; j < ArrayElementCount; j++) {
            ConstantInt *Elem =
                dyn_cast<ConstantInt>(Init->getAggregateElement(j));
            if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
              ConstantsMatch = false;
              break;
            }
          }
          if (!ConstantsMatch)
            continue;

          // Do the transformation (For 32-bit type):
          // -> (and (load arr[idx]), inp)
          // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
          // that will be replaced with one bzhi instruction.
          SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
          SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

          // Get the Node which indexes into the array.
          SDValue Index = getIndexFromUnindexedLoad(Ld);
          if (!Index)
            return SDValue();
          Index = DAG.getZExtOrTrunc(Index, dl, VT);

          SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

          SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
          SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

          return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
        }
      }
    }
  }
  return SDValue();
}

static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);

  // If this is SSE1-only, convert to FAND to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32,
        DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
                    DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                    DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;

  if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
    return R;

  // Attempt to recursively combine a bitmask AND with shuffles.
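  // E.g. (illustrative): and v16i8 X, <-1,-1,0,0,...> only keeps or zeroes
  // whole bytes, so it can be folded into a shuffle of X that uses zero
  // sentinels for the masked-off bytes.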
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  // Attempt to combine a scalar bitmask AND with an extracted shuffle.
  if ((VT.getScalarSizeInBits() % 8) == 0 &&
      N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
    SDValue BitMask = N->getOperand(1);
    SDValue SrcVec = N->getOperand(0).getOperand(0);
    EVT SrcVecVT = SrcVec.getValueType();

    // Check that the constant bitmask masks whole bytes.
    APInt UndefElts;
    SmallVector<APInt, 8> EltBits;
    if (VT == SrcVecVT.getScalarType() &&
        N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
        getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
        llvm::all_of(EltBits, [](APInt M) {
          return M.isNullValue() || M.isAllOnesValue();
        })) {
      unsigned NumElts = SrcVecVT.getVectorNumElements();
      unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
      unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

      // Create a root shuffle mask from the byte mask and the extracted index.
      SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
      for (unsigned i = 0; i != Scale; ++i) {
        if (UndefElts[i])
          continue;
        int VecIdx = Scale * Idx + i;
        ShuffleMask[VecIdx] =
            EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
      }

      if (SDValue Shuffle = combineX86ShufflesRecursively(
              {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
              /*HasVarMask*/ false, DAG, DCI, Subtarget))
        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
                           N->getOperand(0).getOperand(1));
    }
  }

  return SDValue();
}

// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        (VT.is256BitVector() && Subtarget.hasInt256())))
    return SDValue();

  // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);

  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
  // ANDNP combine allows other combines to happen that prevent matching.
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();

  SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
  if (N0.getOperand(0) == Mask)
    Y = N0.getOperand(1);
  if (N0.getOperand(1) == Mask)
    Y = N0.getOperand(0);

  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();

  // Validate that X, Y, and Mask are bitcasts, and see through them.
  Mask = peekThroughBitcasts(Mask);
  X = peekThroughBitcasts(X);
  Y = peekThroughBitcasts(Y);

  EVT MaskVT = Mask.getValueType();
  unsigned EltBits = MaskVT.getScalarSizeInBits();

  // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
  // We know that, if fNegate is 0 or 1:
  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
  //
  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
  //   (  M     ? -X : X) == ((X ^   M     ) + (M & 1))
  //
  // This lets us transform our vselect to:
  //   (add (xor X, M), (and M, 1))
  // And further to:
  //   (sub (xor X, M), M)
  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
    auto IsNegV = [](SDNode *N, SDValue V) {
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };
    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
      SDValue SubOp2 = Mask;

      // If the negate was on the false side of the select, then
      // the operands of the SUB need to be swapped. PR 27251.
      // This is because the pattern being matched above is
      //   (vselect M, (sub (0, X), X)  ->  (sub (xor X, M), M)
      // but if the pattern matched was
      //   (vselect M, X, (sub (0, X))), that is really negation of the pattern
      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
      // pattern also needs to be a negation of the replacement pattern above.
      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
      // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);

      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
      return DAG.getBitcast(VT, Res);
    }
  }

  // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();

  MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

  X = DAG.getBitcast(BlendVT, X);
  Y = DAG.getBitcast(BlendVT, Y);
  Mask = DAG.getBitcast(BlendVT, Mask);
  Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
  return DAG.getBitcast(VT, Mask);
}

// Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
// Input pattern is checked by caller.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                          SelectionDAG &DAG) {
  SDValue Cmp = Op.getOperand(1);
  EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
  // The result of the shift is true or false, and on X86, the 32-bit
  // encoding of shr and lzcnt is more desirable.
  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
                            DAG.getConstant(Log2b, dl, VT));
  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
}

// Try to transform:
//   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
// Will also attempt to match more generic cases, eg:
//   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
// Only applies if the target supports the FastLZCNT feature.
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
    return SDValue();

  auto isORCandidate = [](SDValue N) {
    return (N->getOpcode() == ISD::OR && N->hasOneUse());
  };

  // Check that the zero extend is extending to 32 bits or more. The code
  // generated by srl(ctlz) for 16-bit or less variants of the pattern would
  // require extra instructions to clear the upper bits.
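  // E.g. for i32: ctlz(x) == 32 iff x == 0, and 32 is the only possible
  // result with bit 5 set, so (ctlz(x) >> 5) is 1 exactly when x == 0.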
  if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
      !isORCandidate(N->getOperand(0)))
    return SDValue();

  // Check the node matches: setcc(eq, cmp 0)
  auto isSetCCCandidate = [](SDValue N) {
    return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
           X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
           N->getOperand(1).getOpcode() == X86ISD::CMP &&
           isNullConstant(N->getOperand(1).getOperand(1)) &&
           N->getOperand(1).getValueType().bitsGE(MVT::i32);
  };

  SDNode *OR = N->getOperand(0).getNode();
  SDValue LHS = OR->getOperand(0);
  SDValue RHS = OR->getOperand(1);

  // Save nodes matching or(or, setcc(eq, cmp 0)).
  SmallVector<SDNode *, 2> ORNodes;

  while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
          (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
    ORNodes.push_back(OR);
    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
  }

  // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
      !isORCandidate(SDValue(OR, 0)))
    return SDValue();

  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower
  // it to:
  //   or(srl(ctlz), srl(ctlz)).
  // The dag combiner can then fold it into:
  //   srl(or(ctlz, ctlz)).
  EVT VT = OR->getValueType(0);
  SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
  SDValue Ret, NewRHS;
  if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();

  // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
  while (ORNodes.size() > 0) {
    OR = ORNodes.pop_back_val();
    LHS = OR->getOperand(0);
    RHS = OR->getOperand(1);
    // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
    if (RHS->getOpcode() == ISD::OR)
      std::swap(LHS, RHS);
    EVT VT = OR->getValueType(0);
    SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}

static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI,
                         const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // If this is SSE1-only, convert to FOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
    return DAG.getBitcast(MVT::v4i32,
                          DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
                                      DAG.getBitcast(MVT::v4f32, N0),
                                      DAG.getBitcast(MVT::v4f32, N1)));
  }

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
    return R;

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

  // SHLD/SHRD instructions have lower register pressure, but on some
  // platforms they have higher latency than the equivalent
  // series of shifts/or that would otherwise be generated.
  // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
  // have higher latencies and we are not optimizing for size.
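  // E.g. for i64: (x << 12) | (y >> 52) funnels the top 12 bits of y into the
  // low bits of the result, which is exactly shld x, y, 12.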
  if (!OptForSize && Subtarget.isSHLDSlow())
    return SDValue();

  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  SDLoc DL(N);
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB ||
      ShAmt0.getOpcode() == ISD::XOR) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
  // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
  // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT, Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
  } else if (ShAmt1.getOpcode() == ISD::XOR) {
    SDValue Mask = ShAmt1.getOperand(1);
    if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
      unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
      SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
      if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
        ShAmt1Op0 = ShAmt1Op0.getOperand(0);
      if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
        if (Op1.getOpcode() == InnerShift &&
            isa<ConstantSDNode>(Op1.getOperand(1)) &&
            Op1.getConstantOperandVal(1) == 1) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
        // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
        if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
            Op1.getOperand(0) == Op1.getOperand(1)) {
          return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
                             DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
        }
      }
    }
  }

  return SDValue();
}

/// Try to turn tests against the signbit in the form of:
///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
  // This is only worth doing if the output type is i8 or i1.
  EVT ResultType = N->getValueType(0);
  if (ResultType != MVT::i8 && ResultType != MVT::i1)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // We should be performing an xor against a truncated shift.
  if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
    return SDValue();

  // Make sure we are performing an xor against one.
  if (!isOneConstant(N1))
    return SDValue();

  // SetCC on x86 zero extends so only act on this if it's a logical shift.
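  // E.g. for i32: xor (trunc (srl X, 31)), 1 yields 1 iff the sign bit of X
  // is clear, which is the same as setgt X, -1.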
  SDValue Shift = N0.getOperand(0);
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
    return SDValue();

  // Make sure we are truncating from one of i16, i32 or i64.
  EVT ShiftTy = Shift.getValueType();
  if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
    return SDValue();

  // Make sure the shift amount extracts the sign bit.
  if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
      Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, using SETGT matches up with what TranslateX86CC does.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
  EVT ShiftOpTy = ShiftOp.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                               *DAG.getContext(), ResultType);
  SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
                              DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
  if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}

/// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
/// This should be called before type legalization because the pattern may not
/// persist after that.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isSimple())
    return SDValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
  case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }

  // There must be a shift right algebraic before the xor, and the xor must be
  // a 'not' operation.
  SDValue Shift = N->getOperand(0);
  SDValue Ones = N->getOperand(1);
  if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
      !ISD::isBuildVectorAllOnes(Ones.getNode()))
    return SDValue();

  // The shift should be smearing the sign bit across each vector element.
  auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();

  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
  auto *ShiftAmt = ShiftBV->getConstantSplatNode();
  if (!ShiftAmt ||
      ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
    return SDValue();

  // Create a greater-than comparison against -1. We don't use the more obvious
  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
}

/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
                                        const X86Subtarget &Subtarget) {
  if (!Subtarget.hasAVX512())
    return false;

  // FIXME: Scalar type may be supported if we move it to vector register.
  if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
    return false;

  EVT SrcElVT = SrcVT.getScalarType();
  EVT DstElVT = DstVT.getScalarType();
  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
    return false;
  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
    return false;
  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
  return false;
}

/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectUSatPattern(SDValue In, EVT VT) {
  if (In.getOpcode() != ISD::UMIN)
    return SDValue();

  // Saturation with truncation. We truncate from InVT to VT.
  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  APInt C;
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
  }
  return SDValue();
}

/// Detect a pattern of truncation with saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
                                       const X86Subtarget &Subtarget) {
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectUSatPattern(In, VT);
}

static SDValue combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
    return SDValue();
  if (auto USatVal = detectUSatPattern(In, VT))
    if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
  return SDValue();
}

/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector() || !VT.isSimple())
    return SDValue();
  EVT InVT = In.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  EVT ScalarVT = VT.getVectorElementType();
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
        isPowerOf2_32(NumElems)))
    return SDValue();

  // InScalarVT is the intermediate type in AVG pattern and it should be
  // greater than the original input type (i8/i16).
  EVT InScalarVT = InVT.getVectorElementType();
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
    return SDValue();

  if (!Subtarget.hasSSE2())
    return SDValue();

  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
  //
  // In AVX512, the last instruction can also be a trunc store.
  if (In.getOpcode() != ISD::SRL)
    return SDValue();

  // A lambda checking the given SDValue is a constant vector and each element
  // is in the range [Min, Max].
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (SDValue Op : V->ops()) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return false;
      uint64_t Val = C->getZExtValue();
      if (Val < Min || Val > Max)
        return false;
    }
    return true;
  };

  // Split vectors to legal target size and apply AVG.
  auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
    unsigned NumSubs = 1;
    if (Subtarget.hasBWI()) {
      if (VT.getSizeInBits() > 512)
        NumSubs = VT.getSizeInBits() / 512;
    } else if (Subtarget.hasAVX2()) {
      if (VT.getSizeInBits() > 256)
        NumSubs = VT.getSizeInBits() / 256;
    } else {
      if (VT.getSizeInBits() > 128)
        NumSubs = VT.getSizeInBits() / 128;
    }

    if (NumSubs == 1)
      return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

    SmallVector<SDValue, 4> Subs;
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                 VT.getVectorNumElements() / NumSubs);
    for (unsigned i = 0; i != NumSubs; ++i) {
      unsigned Idx = i * SubVT.getVectorNumElements();
      SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
      SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
      Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
  };

  // Check if each element of the vector is right-shifted by one.
  auto LHS = In.getOperand(0);
  auto RHS = In.getOperand(1);

  if (!IsConstVectorInRange(RHS, 1, 1))
    return SDValue();
  if (LHS.getOpcode() != ISD::ADD)
    return SDValue();

  // Detect a pattern of a + b + 1 where the order doesn't matter.
  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  // Take care of the case when one of the operands is a constant vector whose
  // element is in the range [1, 256] for i8 or [1, 65536] for i16.
  if (IsConstVectorInRange(Operands[1], 1,
                           ScalarVT == MVT::i8 ? 256 : 65536) &&
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
      Operands[0].getOperand(0).getValueType() == VT) {
    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit X86ISD::AVG instruction.
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
  }

  if (Operands[0].getOpcode() == ISD::ADD)
    std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction.
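    // E.g. with i8 elements (illustrative values): a = 200, b = 101 gives
    // (200 + 101 + 1) >> 1 = 151, the rounded average that PAVGB computes.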
    return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
  }

  return SDValue();
}

static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  LoadSDNode *Ld = cast<LoadSDNode>(N);
  EVT RegVT = Ld->getValueType(0);
  EVT MemVT = Ld->getMemoryVT();
  SDLoc dl(Ld);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
  // into two 16-byte operations. Also split non-temporal aligned loads on
  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
  ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
  unsigned Alignment = Ld->getAlignment();
  if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
      Ext == ISD::NON_EXTLOAD &&
      ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
       (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
                               AddressSpace, Alignment, &Fast) &&
        !Fast))) {
    unsigned NumElems = RegVT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Ptr = Ld->getBasePtr();

    EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                                  NumElems / 2);

    SDValue Load1 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Alignment, Ld->getMemOperand()->getFlags());

    Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
    SDValue Load2 =
        DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
                    Ld->getPointerInfo().getWithOffset(16),
                    MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                             Load1.getValue(1),
                             Load2.getValue(1));

    SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
    return DCI.CombineTo(N, NewVec, TF, true);
  }

  return SDValue();
}

/// If V is a build vector of boolean constants and exactly one of those
/// constants is true, return the operand index of that true element.
/// Otherwise, return -1.
static int getOneTrueElt(SDValue V) {
  // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
  auto *BV = dyn_cast<BuildVectorSDNode>(V);
  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
    return -1;

  int TrueIndex = -1;
  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i < NumElts; ++i) {
    const SDValue &Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
    if (!ConstNode)
      return -1;
    if (ConstNode->getAPIntValue().isAllOnesValue()) {
      // If we already found a one, this is too many.
      if (TrueIndex >= 0)
        return -1;
      TrueIndex = i;
    }
  }
  return TrueIndex;
}

/// Given a masked memory load/store operation, return true if it has one mask
/// bit set. If it has one mask bit set, then also return the memory address of
/// the scalar element to load/store, the vector index to insert/extract that
/// scalar element, and the alignment for the scalar memory access.
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
                                         SelectionDAG &DAG, SDValue &Addr,
                                         SDValue &Index, unsigned &Alignment) {
  int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
  if (TrueMaskElt < 0)
    return false;

  // Get the address of the one scalar element that is specified by the mask
  // using the appropriate offset from the base pointer.
  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
  Addr = MaskedOp->getBasePtr();
  if (TrueMaskElt != 0) {
    unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
    Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
  }

  Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
  return true;
}

/// If exactly one element of the mask is set for a non-extending masked load,
/// it is a scalar load and vector insert.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Load the one scalar element that is specified by the mask using the
  // appropriate offset from the base pointer.
  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  SDValue Load =
      DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
                  Alignment, ML->getMemOperand()->getFlags());

  // Insert the loaded element into the appropriate place in the vector.
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
                               Load, VecIndex);
  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
}

static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
  if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
    return SDValue();

  SDLoc DL(ML);
  EVT VT = ML->getValueType(0);

  // If we are loading the first and last elements of a vector, it is safe and
  // always faster to load the whole vector. Replace the masked load with a
  // vector load and select.
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
  bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
  bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
  if (LoadFirstElt && LoadLastElt) {
    SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                ML->getMemOperand());
    SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
    return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
  }

  // Convert a masked load with a constant mask into a masked load and a
  // select. This allows the select operation to use a faster kind of select
  // instruction (for example, vblendvps -> vblendps).

  // Don't try this if the pass-through operand is already undefined. That
  // would cause an infinite loop because that's what we're about to create.
  if (ML->getSrc0().isUndef())
    return SDValue();

  // The new masked load has an undef pass-through operand. The select uses the
  // original pass-through operand.
  SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
                                    ML->getMask(), DAG.getUNDEF(VT),
                                    ML->getMemoryVT(), ML->getMemOperand(),
                                    ML->getExtensionType());
  SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());

  return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
}

static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for extending masked load");

  unsigned SizeRatio = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   LdVT.getScalarType(), NumElems * SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }

  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems * SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 4> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}

/// If exactly one element of the mask is set for a non-truncating masked
/// store, it is a vector extract and scalar store.
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
/// mask have already been optimized in IR, so we don't bother with those here.
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
                                              SelectionDAG &DAG) {
  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
  // However, some target hooks may need to be added to know when the transform
  // is profitable. Endianness would also have to be considered.

  SDValue Addr, VecIndex;
  unsigned Alignment;
  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
    return SDValue();

  // Extract the one scalar element that is actually being stored.
  SDLoc DL(MS);
  EVT VT = MS->getValue().getValueType();
  EVT EltVT = VT.getVectorElementType();
  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                                MS->getValue(), VecIndex);

  // Store that element at the appropriate offset from the base pointer.
  return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
                      Alignment, MS->getMemOperand()->getFlags());
}

static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore()) {
    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
      return ScalarStore;

    // If the mask is checking (0 > X), we're creating a vector with all-zeros
    // or all-ones elements based on the sign bits of X. AVX1 masked store only
    // cares about the sign bit of each mask element, so eliminate the compare:
    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
    // Note that by waiting to match an x86-specific PCMPGT node, we're
    // eliminating potentially more complex matching of a setcc node which has
    // a full range of predicates.
    SDValue Mask = Mst->getMask();
    if (Mask.getOpcode() == X86ISD::PCMPGT &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
      assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
             "Unexpected type for PCMPGT");
      return DAG.getMaskedStore(
          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
          Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
    }

    // TODO: AVX512 targets should also be able to simplify something like the
    // pattern above, but that pattern will be different. It will either need
    // to match setcc more generally or match PCMPGTM later (in tablegen?).

    return SDValue();
  }

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be pow of two.
  assert(isPowerOf2_32(NumElems * FromSz * ToSz) &&
         "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert(((NumElems * FromSz) % ToSz) == 0 &&
         "Unexpected ratio for truncating masked store");

  unsigned SizeRatio = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                   StVT.getScalarType(), NumElems * SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems * SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    SmallVector<SDValue, 4> Ops(NumConcat, ZeroVal);
    Ops[0] = Mask;
    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}

static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  EVT StVT = St->getMemoryVT();
  SDLoc dl(St);
  SDValue StoredVal = St->getOperand(1);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we are saving a concatenation of two XMM registers and 32-byte stores
  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
  unsigned Alignment = St->getAlignment();
  if (VT.is256BitVector() && StVT == VT &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
    unsigned NumElems = VT.getVectorNumElements();
    if (NumElems < 2)
      return SDValue();

    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

    SDValue Ptr0 = St->getBasePtr();
    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

    SDValue Ch0 =
        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
                     Alignment, St->getMemOperand()->getFlags());
    SDValue Ch1 =
        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
                     St->getPointerInfo().getWithOffset(16),
                     MinAlign(Alignment, 16U),
                     St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
  }

  // Optimize trunc store (of multiple scalars) to shuffle and store.
  // First, pack all of the elements in one place. Next, store to memory
  // in fewer chunks.
  if (St->isTruncatingStore() && VT.isVector()) {
    // Check if we can detect an AVG pattern from the truncation. If yes,
    // replace the trunc store by a normal store with the result of the
    // X86ISD::AVG instruction.
    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
                                       Subtarget, dl))
      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
                          St->getPointerInfo(), St->getAlignment(),
                          St->getMemOperand()->getFlags());

    if (SDValue Val =
            detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
                                    Subtarget))
      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                             dl, Val, St->getBasePtr(), St->getMemoryVT(),
                             St->getMemOperand(), DAG);

    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned NumElems = VT.getVectorNumElements();
    assert(StVT != VT && "Cannot truncate to the same type");
    unsigned FromSz = VT.getScalarSizeInBits();
    unsigned ToSz = StVT.getScalarSizeInBits();

    // The truncating store is legal in some cases. For example
    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
    // are designated for truncate store.
    // In this case we don't need any further transformations.
    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
      return SDValue();

    // From/To sizes and ElemCount must be pow of two.
    if (!isPowerOf2_32(NumElems * FromSz * ToSz))
      return SDValue();
    // We are going to use the original vector elt for storing.
    // Accumulated smaller vector elements must be a multiple of the store
    // size.
    if (0 != (NumElems * FromSz) % ToSz)
      return SDValue();

    unsigned SizeRatio = FromSz / ToSz;
    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

    // Create a type on which we perform the shuffle.
    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
                                     StVT.getScalarType(),
                                     NumElems * SizeRatio);

    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    if (!TLI.isTypeLegal(WideVecVT))
      return SDValue();

    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                         DAG.getUNDEF(WideVecVT),
                                         ShuffleVec);
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to mem.

    // Find the largest store unit
    MVT StoreType = MVT::i8;
    for (MVT Tp : MVT::integer_valuetypes()) {
      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
        StoreType = Tp;
    }

    // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
        (64 <= NumElems * ToSz))
      StoreType = MVT::f64;

    // Bitcast the original vector into a vector of store-size units
    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
                                      StoreType,
                                      VT.getSizeInBits() /
                                          StoreType.getSizeInBits());
    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
    SmallVector<SDValue, 4> Chains;
    SDValue Ptr = St->getBasePtr();

    // Perform one or more big stores into memory.
    for (unsigned i = 0, e = (ToSz * NumElems) / StoreType.getSizeInBits();
         i != e; ++i) {
      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                   StoreType, ShuffWide,
                                   DAG.getIntPtrConstant(i, dl));
      SDValue Ch =
          DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
                       St->getAlignment(), St->getMemOperand()->getFlags());
      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
      Chains.push_back(Ch);
    }

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  }

  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS. This qualifies as a quick hack.
  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
    SmallVector<SDValue, 8> Ops;

    if (!ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    SDLoc LdDL(Ld);
    SDLoc StDL(N);
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget.is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getMemOperand());

      // Make sure new load is placed in same chain order.
      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
                          St->getMemOperand());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(), Ld->getAlignment(),
                               Ld->getMemOperand()->getFlags());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               MinAlign(Ld->getAlignment(), 4),
                               Ld->getMemOperand()->getFlags());
    // Make sure new loads are placed in same chain order.
    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

    SDValue LoSt = DAG.getStore(St->getChain(), StDL, LoLd, LoAddr,
                                St->getPointerInfo(), St->getAlignment(),
                                St->getMemOperand()->getFlags());
    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                MinAlign(St->getAlignment(), 4),
                                St->getMemOperand()->getFlags());
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }

  // This is similar to the above case, but here we handle a scalar 64-bit
  // integer store that is extracted from a vector on a 32-bit target.
  // If we have SSE2, then we can treat it like a floating-point double
  // to get past legalization. The execution dependencies fixup pass will
  // choose the optimal machine instruction for the store if this really is
  // an integer or v2f32 rather than an f64.
  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    SDValue OldExtract = St->getOperand(1);
    SDValue ExtOp0 = OldExtract.getOperand(0);
    unsigned VecSize = ExtOp0.getValueSizeInBits();
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                     BitCast, OldExtract.getOperand(1));
    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
                        St->getPointerInfo(), St->getAlignment(),
                        St->getMemOperand()->getFlags());
  }

  return SDValue();
}

/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS.  A
/// horizontal operation performs the binary operation on successive elements
/// of its first operand, then on successive elements of its second operand,
/// returning the resulting values in a vector.  For example, if
///   A = < float a0, float a1, float a2, float a3 >
/// and
///   B = < float b0, float b1, float b2, float b3 >
/// then the result of doing a horizontal operation on A and B is
///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
/// Note that the binary operation should have the property that if one of the
/// operands is UNDEF then the result is UNDEF.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
  // Look for the following pattern: if
  //   A = < float a0, float a1, float a2, float a3 >
  //   B = < float b0, float b1, float b2, float b3 >
  // and
  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
  // which is A horizontal-op B.

  // At least one of the operands should be a vector shuffle.
  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  MVT VT = LHS.getSimpleValueType();
  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Unsupported vector type for horizontal add/sub");

  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
  // operate independently on 128-bit lanes.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = VT.getSizeInBits()/128;
  unsigned NumLaneElts = NumElts / NumLanes;
  assert((NumLaneElts % 2 == 0) &&
         "Vector type should have an even number of elements in each lane");
  unsigned HalfLaneElts = NumLaneElts/2;

  // View LHS in the form
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  // If LHS is not a shuffle then pretend it is the shuffle
  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!LHS.getOperand(0).isUndef())
      A = LHS.getOperand(0);
    if (!LHS.getOperand(1).isUndef())
      B = LHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), LMask.begin());
  } else {
    if (!LHS.isUndef())
      A = LHS;
    for (unsigned i = 0; i != NumElts; ++i)
      LMask[i] = i;
  }

  // Likewise, view RHS in the form
  //   RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!RHS.getOperand(0).isUndef())
      C = RHS.getOperand(0);
    if (!RHS.getOperand(1).isUndef())
      D = RHS.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
    std::copy(Mask.begin(), Mask.end(), RMask.begin());
  } else {
    if (!RHS.isUndef())
      C = RHS;
    for (unsigned i = 0; i != NumElts; ++i)
      RMask[i] = i;
  }

  // Check that the shuffles are both shuffling the same vectors.
  if (!(A == C && B == D) && !(A == D && B == C))
    return false;

  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
  if (!A.getNode() && !B.getNode())
    return false;

  // If A and B occur in reverse order in RHS, then "swap" them (which means
  // rewriting the mask).
  if (A != C)
    ShuffleVectorSDNode::commuteMask(RMask);

  // At this point LHS and RHS are equivalent to
  //   LHS = VECTOR_SHUFFLE A, B, LMask
  //   RHS = VECTOR_SHUFFLE A, B, RMask
  // Check that the masks correspond to performing a horizontal operation.
  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      int LIdx = LMask[i+l], RIdx = RMask[i+l];

      // Ignore any UNDEF components.
      if (LIdx < 0 || RIdx < 0 ||
          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
        continue;

      // Check that successive elements are being operated on. If not, this is
      // not a horizontal operation.
      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
      if (!(LIdx == Index && RIdx == Index + 1) &&
          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
        return false;
    }
  }

  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
  return true;
}

/// Do target-specific dag combines on floating-point adds/subs.
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  bool IsFadd = N->getOpcode() == ISD::FADD;
  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
      isHorizontalBinOp(LHS, RHS, IsFadd)) {
    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
  }
  return SDValue();
}

/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
/// the codegen.
/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
  SDValue Src = N->getOperand(0);
  unsigned Opcode = Src.getOpcode();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();

  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

    // Repeated operand, so we are only trading one output truncation for
    // one input truncation.
    if (Op0 == Op1)
      return true;

    // See if either operand has been extended from a smaller/equal size to
    // the truncation size, allowing a truncation to combine with the extend.
    unsigned Opcode0 = Op0.getOpcode();
    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
         Opcode0 == ISD::ZERO_EXTEND) &&
        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    unsigned Opcode1 = Op1.getOpcode();
    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
         Opcode1 == ISD::ZERO_EXTEND) &&
        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
      return true;

    // See if either operand is a single use constant which can be constant
    // folded.
    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
  };

  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
  };

  // Don't combine if the operation has other uses.
  if (!N->isOnlyUserOf(Src.getNode()))
    return SDValue();

  // Only support vector truncation for now.
  // TODO: i64 scalar math would benefit as well.
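  // e.g. (v8i16 trunc (v8i32 add X, X)) only pays for one new truncate, since
  // the operand repeats; cases like this are what the switch below accepts.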
  if (!VT.isVector())
    return SDValue();

  // In most cases it's only worth pre-truncating if we're only facing the cost
  // of one truncation.
  // i.e. if one of the inputs will constant fold or the input is repeated.
  switch (Opcode) {
  case ISD::AND:
  case ISD::XOR:
  case ISD::OR: {
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }

  case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
        !Subtarget.hasDQI())
      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
    LLVM_FALLTHROUGH;
  case ISD::ADD: {
    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
    SDValue Op0 = Src.getOperand(0);
    SDValue Op1 = Src.getOperand(1);
    if (TLI.isOperationLegal(Opcode, VT) &&
        IsRepeatedOpOrFreeTruncation(Op0, Op1))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
  }

  return SDValue();
}

/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
                             Regs[0].getValueType() == MVT::v2i64));
  EVT OutVT = N->getValueType(0);
  EVT OutSVT = OutVT.getVectorElementType();
  EVT InVT = Regs[0].getValueType();
  EVT InSVT = InVT.getVectorElementType();
  SDLoc DL(N);

  // First, use mask to unset all bits that won't appear in the result.
  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
         "OutSVT can only be either i8 or i16.");
  APInt Mask =
      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
  for (auto &Reg : Regs)
    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

  MVT UnpackedVT, PackedVT;
  if (OutSVT == MVT::i8) {
    UnpackedVT = MVT::v8i16;
    PackedVT = MVT::v16i8;
  } else {
    UnpackedVT = MVT::v4i32;
    PackedVT = MVT::v8i16;
  }

  // In each iteration, truncate the type by a half size.
  auto RegNum = Regs.size();
  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
       j < e; j *= 2, RegNum /= 2) {
    for (unsigned i = 0; i < RegNum; i++)
      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
    for (unsigned i = 0; i < RegNum / 2; i++)
      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
                            Regs[i * 2 + 1]);
  }

  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
  if (OutVT == MVT::v8i8) {
    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
                          DAG.getIntPtrConstant(0, DL));
    return Regs[0];
  } else if (RegNum > 1) {
    Regs.resize(RegNum);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
static SDValue
combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG,
                                  SmallVector<SDValue, 8> &Regs) {
  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
  EVT OutVT = N->getValueType(0);
  SDLoc DL(N);

  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
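  // This sign-extends bit 15 of each 32-bit lane across its upper half, so the
  // signed saturation in the following PACKSS reproduces the low 16 bits
  // exactly.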
  SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
  for (auto &Reg : Regs) {
    Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
    Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
                              Subtarget, DAG);
  }

  for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
    Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
                          Regs[i * 2 + 1]);

  if (Regs.size() > 2) {
    Regs.resize(Regs.size() / 2);
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
  } else
    return Regs[0];
}

/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
/// legalization the truncation will be translated into a BUILD_VECTOR with
/// each element that is extracted from a vector and then truncated, and it is
/// difficult to do this optimization based on them.
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  EVT OutVT = N->getValueType(0);
  if (!OutVT.isVector())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  EVT InVT = In.getValueType();
  unsigned NumElems = OutVT.getVectorNumElements();

  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
  // SSE2, and we need to take care of it specially.
  // AVX512 provides vpmovdb.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
    return SDValue();

  EVT OutSVT = OutVT.getVectorElementType();
  EVT InSVT = InVT.getVectorElementType();
  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
        NumElems >= 8))
    return SDValue();

  // SSSE3's pshufb results in fewer instructions in the cases below.
  if (Subtarget.hasSSSE3() && NumElems == 8 &&
      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
    return SDValue();

  SDLoc DL(N);

  // Split a long vector into vectors of legal type.
  unsigned RegNum = InVT.getSizeInBits() / 128;
  SmallVector<SDValue, 8> SubVec(RegNum);
  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

  for (unsigned i = 0; i < RegNum; i++)
    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides
  // PACKUS for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS
  // to truncate 2 x v4i32 to v8i16.
  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
  else if (InSVT == MVT::i32)
    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
  else
    return SDValue();
}

/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
    return SDValue();

  SDValue In = N->getOperand(0);
  if (!In.getValueType().isSimple())
    return SDValue();

  MVT VT = N->getValueType(0).getSimpleVT();
  MVT SVT = VT.getScalarType();

  MVT InVT = In.getValueType().getSimpleVT();
  MVT InSVT = InVT.getScalarType();

  // Check we have a truncation suited for PACKSS.
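  // e.g. a vXi32 -> vXi16 truncate of a comparison result (all-zeros or
  // all-ones lanes) can be lowered with PACKSSDW instead of a shuffle.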
  if (!VT.is128BitVector() && !VT.is256BitVector())
    return SDValue();
  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
    return SDValue();
  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
    return SDValue();

  // Use PACKSS if the input has sign-bits that extend all the way to the
  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

  // Use PACKUS if the input has zero-bits that extend all the way to the
  // packed/truncated value. e.g. masks, zext_in_reg, etc.
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

  return SDValue();
}

static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

  // Attempt to pre-truncate inputs to arithmetic ops instead.
  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
    return V;

  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;

  // Try to combine truncation with unsigned saturation.
  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
    return Val;

  // The bitcast source is a direct mmx result.
  // Detect bitcasts between i32 to x86mmx.
  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
  }

  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
  if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
    return V;

  return combineVectorTruncation(N, DAG, Subtarget);
}

/// Returns the negated value if the node \p N flips sign of FP value.
///
/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
/// AVX512F does not have FXOR, so FNEG is lowered as
/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
static SDValue isFNEG(SDNode *N) {
  if (N->getOpcode() == ISD::FNEG)
    return N->getOperand(0);

  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
    return SDValue();

  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
  if (!Op1.getValueType().isFloatingPoint())
    return SDValue();

  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

  unsigned EltBits = Op1.getScalarValueSizeInBits();
  auto isSignMask = [&](const ConstantFP *C) {
    return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
  };

  // There is more than one way to represent the same constant on
  // the different X86 targets. The type of the node may also depend on size.
  //  - load scalar value and broadcast
  //  - BUILD_VECTOR node
  //  - load from a constant pool.
  // We check all variants here.
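  // For f32 elements the sign mask is 0x80000000 and for f64 it is
  // 0x8000000000000000; isSignMask above matches exactly that single bit.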
  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
      if (isSignMask(cast<ConstantFP>(C)))
        return Op0;
  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
      if (isSignMask(CN->getConstantFPValue()))
        return Op0;
  } else if (auto *C = getTargetConstantFromNode(Op1)) {
    if (C->getType()->isVectorTy()) {
      if (auto *SplatV = C->getSplatValue())
        if (isSignMask(cast<ConstantFP>(SplatV)))
          return Op0;
    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
      if (isSignMask(FPConst))
        return Op0;
  }
  return SDValue();
}

/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  EVT OrigVT = N->getValueType(0);
  SDValue Arg = isFNEG(N);
  assert(Arg.getNode() && "N is expected to be an FNEG node");

  EVT VT = Arg.getValueType();
  EVT SVT = VT.getScalarType();
  SDLoc DL(N);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  // If we're negating a FMUL node on a target with FMA, then we can avoid the
  // use of a constant by performing (-0 - A*B) instead.
  // FIXME: Check rounding control flags as well once it becomes available.
  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
                                  Arg.getOperand(1), Zero);
    return DAG.getBitcast(OrigVT, NewNode);
  }

  // If we're negating an FMA node, then we can adjust the
  // instruction to include the extra negation.
  unsigned NewOpcode = 0;
  if (Arg.hasOneUse()) {
    switch (Arg.getOpcode()) {
    case ISD::FMA:           NewOpcode = X86ISD::FNMSUB;     break;
    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
    case X86ISD::FNMSUB:     NewOpcode = ISD::FMA;           break;
    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
    // We can't handle scalar intrinsic node here because it would only
    // invert one element and not the whole vector. But we could try to handle
    // a negation of the lower element only.
    }
  }
  if (NewOpcode)
    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
                                              Arg.getNode()->ops()));

  return SDValue();
}

static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);

  // If we have integer vector types available, use the integer opcodes.
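  // e.g. (v4f32 FAND a, b) becomes
  // (v4f32 bitcast (v2i64 and (v2i64 bitcast a), (v2i64 bitcast b))).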
  if (VT.isVector() && Subtarget.hasSSE2()) {
    SDLoc dl(N);

    MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);

    SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
    SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
    unsigned IntOpcode;
    switch (N->getOpcode()) {
    default: llvm_unreachable("Unexpected FP logic op");
    case X86ISD::FOR:   IntOpcode = ISD::OR; break;
    case X86ISD::FXOR:  IntOpcode = ISD::XOR; break;
    case X86ISD::FAND:  IntOpcode = ISD::AND; break;
    case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
    }
    SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
    return DAG.getBitcast(VT, IntOp);
  }
  return SDValue();
}

/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
  if (N->getOpcode() != ISD::XOR)
    return SDValue();

  SDValue LHS = N->getOperand(0);
  auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
    return SDValue();

  X86::CondCode NewCC = X86::GetOppositeBranchCondition(
      X86::CondCode(LHS->getConstantOperandVal(0)));
  SDLoc DL(N);
  return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
}

static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
  // If this is SSE1 only convert to FXOR to avoid scalarization.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
      N->getValueType(0) == MVT::v4i32) {
    return DAG.getBitcast(
        MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
                                DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
                                DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
  }

  if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
    return Cmp;

  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue SetCC = foldXor1SetCC(N, DAG))
    return SetCC;

  if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
    return RV;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (isFNEG(N))
    return combineFneg(N, DAG, Subtarget);
  return SDValue();
}

static bool isNullFPScalarOrVectorConst(SDValue V) {
  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
}

/// If a value is a scalar FP zero or a vector FP zero (potentially including
/// undefined elements), return a zero constant that may be used to fold away
/// that value. In the case of a vector, the returned constant will not contain
/// undefined elements even if the input parameter does. This makes it suitable
/// to be used as a replacement operand with operations (eg, bitwise-and) where
/// an undef should not propagate.
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  if (!isNullFPScalarOrVectorConst(V))
    return SDValue();

  if (V.getValueType().isVector())
    return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));

  return V;
}

static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
        (VT == MVT::f64 && Subtarget.hasSSE2()) ||
        (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
    return SDValue();

  auto isAllOnesConstantFP = [](SDValue V) {
    if (V.getSimpleValueType().isVector())
      return ISD::isBuildVectorAllOnes(V.getNode());
    auto *C = dyn_cast<ConstantFPSDNode>(V);
    return C && C->getConstantFPValue()->isAllOnesValue();
  };

  // fand (fxor X, -1), Y --> fandn X, Y
  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

  // fand X, (fxor Y, -1) --> fandn Y, X
  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
  // FAND(0.0, x) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
    return V;

  // FAND(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  // FANDN(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // FANDN(x, 0.0) -> 0.0
  if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
    return V;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

  // F[X]OR(0.0, x) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
    return N->getOperand(1);

  // F[X]OR(x, 0.0) -> x
  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
    return N->getOperand(0);

  if (isFNEG(N))
    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
      return NewVal;

  return lowerX86FPLogicOp(N, DAG, Subtarget);
}

/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

  // Only perform optimizations if UnsafeMath is used.
  if (!DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
  // into FMINC and FMAXC, which are commutative operations.
  unsigned NewOp = 0;
  switch (N->getOpcode()) {
  default: llvm_unreachable("unknown opcode");
  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
  }

  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                     N->getOperand(0), N->getOperand(1));
}

static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (Subtarget.useSoftFloat())
    return SDValue();

  // TODO: Check for global or instruction-level "nnan". In that case, we
  //       should be able to lower to FMAX/FMIN alone.
  // TODO: If an operand is already known to be a NaN or not a NaN, this
  //       should be an optional swap and FMAX/FMIN.
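  // The sequence built below (min/max, unordered compare, select) provides
  // the IEEE-754 maxNum/minNum behavior of returning the non-NaN operand.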
  EVT VT = N->getValueType(0);
  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
    return SDValue();

  // This takes at least 3 instructions, so favor a library call when operating
  // on a scalar and minimizing code size.
  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDLoc DL(N);
  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //       Num  |  Max  |  Op0 |
  // Op0        ----------------
  //       NaN  |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but
  // rather to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

  EVT VT = N->getValueType(0);

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
      DCI.CombineTo(N, Res);
      return SDValue();
    }
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2, since there is no sign-extended shift-right operation on a vector
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2,
    // it may be replaced with X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp =
          DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext'.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
      !(VT == MVT::i32 || VT == MVT::i64))
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  // If this was a 64-bit extend, complete it.
  if (VT == MVT::i64)
    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
  return R.getValue(1);
}

// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
//     1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//        (or more) pseudo-CMOVs only when they go one-after-another and
//        getting rid of result extension code after CMOV will help that.
//     2) Promotion of constant CMOV arguments is free, hence the
//        {ANY,SIGN,ZERO}_EXTEND will just be deleted.
//     3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
//        promotion is also good in terms of code-size.
//        (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
//         promotion).
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV)
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  bool DoPromoteCMOV =
      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
      CMovN.hasOneUse() &&
      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
       isa<ConstantSDNode>(CMovOp1.getNode()));

  if (!DoPromoteCMOV)
    return SDValue();

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
                     CMovN.getOperand(2), CMovN.getOperand(3));
}

// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
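  // e.g. (v16i8 sext (v16i1 bitcast (i16 X))) is handled here: each lane ends
  // up as 0 or -1 according to the corresponding bit of X.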
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
/// which then extends the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
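  // e.g. with SSE4.1, (v4i32 sext (v4i16 X)) is widened to a v8i16 input and
  // emitted as SIGN_EXTEND_VECTOR_INREG below, which selects to pmovsxwd.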
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to let the legalizer do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps())
    return SDValue();

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending
    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
    // efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when exactly one of NegA and NegB is set.
  bool NegMul = (NegA != NegB);
  bool HasNeg = NegA || NegB || NegC;

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  // For FMA, we risk reconstructing the node we started with.
  // In order to avoid this, we check for negation or opcode change. If
  // one of the two happened, then it is a new node and we return it.
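  // e.g. (fma (fneg A), B, (fneg C)) becomes (X86ISD::FNMSUB A, B, C),
  // computing -(A*B) - C without any explicit negations.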
  if (N->getOpcode() == ISD::FMA) {
    if (HasNeg || NewOpcode != N->getOpcode())
      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
    return SDValue();
  }

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADD4S) {
    switch (NewOpcode) {
    case ISD::FMA:       NewOpcode = X86ISD::FMADD4S; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB4S; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
    }
  } else {
    llvm_unreachable("Unexpected opcode!");
  }

  // Only return the node if the opcode was changed or one of the
  // operands was negated. If not, we'll just recreate the same node.
  if (HasNeg || NewOpcode != N->getOpcode()) {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return SDValue();
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SDValue NegVal = isFNEG(N->getOperand(2).getNode());
  if (!NegVal)
    return SDValue();

  unsigned NewOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::FMADDSUB:     NewOpcode = X86ISD::FMSUBADD;     break;
  case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
  case X86ISD::FMSUBADD:     NewOpcode = X86ISD::FMADDSUB;     break;
  case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
  }

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
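  // X86ISD::SETCC_CARRY materializes the carry flag as an all-zeros or
  // all-ones value (SBB-style), so it can be produced directly at the wider
  // type and masked with 1 there, making the separate zext unnecessary.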
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.isVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison.
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128)
    return SDValue();

  // Ignore a comparison with zero because that gets special treatment in
  // EmitTest(). But make an exception for the special case of a pair of
  // logically-combined vector-sized operands compared to zero. This pattern
  // may be generated by the memcmp expansion pass with oversized integer
  // compares (see PR33325).
  bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
                          X.getOperand(0).getOpcode() == ISD::XOR &&
                          X.getOperand(1).getOpcode() == ISD::XOR;
  if (isNullConstant(Y) && !IsOrXorXorCCZero)
    return SDValue();

  // Bail out if we know that this is not really just an oversized integer.
  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
      peekThroughBitcasts(Y).getValueType() == MVT::f128)
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue Cmp;
    if (IsOrXorXorCCZero) {
      // This is a bitwise-combined equality comparison of 2 pairs of vectors:
      // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
      // Use 2 vector equality compares and 'and' the results before doing a
      // MOVMSK.
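      // ANDing the two PCMPEQ results keeps a lane all-ones only if both
      // pairs matched in that lane, which is exactly the i128/i256 equality.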
      SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
      SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
      SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
      SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
      SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
      SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
      Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
    } else {
      SDValue VecX = DAG.getBitcast(VecVT, X);
      SDValue VecY = DAG.getBitcast(VecVT, Y);
      Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    }
    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Src = N->getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                        !DCI.isBeforeLegalizeOps());

  // MOVMSK only uses the MSB from each vector element.
  KnownBits Known;
  APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
  if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
    DCI.AddToWorklist(Src.getNode());
    DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    // Remove any sign extends from 32 or smaller to larger than 32.
    // Only do this before LegalizeOps in case we need the sign extend for
    // legalization.
    if (Index.getOpcode() == ISD::SIGN_EXTEND) {
      if (Index.getScalarValueSizeInBits() > 32 &&
          Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        DAG.UpdateNodeOperands(N, NewOps);
        // The original sign extend now has fewer uses; add it back to the
        // worklist in case it needs to be removed.
        DCI.AddToWorklist(Index.getNode());
        DCI.AddToWorklist(N);
        return SDValue(N, 0);
      }
    }

    // Make sure the index is either i32 or i64.
    unsigned ScalarSize = Index.getScalarValueSizeInBits();
    if (ScalarSize != 32 && ScalarSize != 64) {
      MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
      EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
                                     Index.getValueType().getVectorNumElements());
      Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index;
      DAG.UpdateNodeOperands(N, NewOps);
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }

    // Try to remove zero extends from 32->64 if we know the sign bit of
    // the input is zero.
    if (Index.getOpcode() == ISD::ZERO_EXTEND &&
        Index.getScalarValueSizeInBits() == 64 &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      if (DAG.SignBitIsZero(Index.getOperand(0))) {
        SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
        NewOps[4] = Index.getOperand(0);
        DAG.UpdateNodeOperands(N, NewOps);
        // The original zero extend now has fewer uses; add it back to the
        // worklist in case it needs to be removed.
        DCI.AddToWorklist(Index.getNode());
        DCI.AddToWorklist(N);
        return SDValue(N, 0);
      }
    }
  }

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
    return SDValue(N, 0);
  }

  // With AVX2 we only demand the upper bit of the mask.
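  // (Hedged sketch: a mask produced as (sext (setcc ...)) is all-ones or
  // all-zeros per lane, and the AVX2 gather/scatter forms consult only the
  // sign bit of each mask element, so SimplifyDemandedBits below may freely
  // rewrite everything except that MSB.)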
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
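    // Illustrative (hedged) example of the intended fold on a 32-bit,
    // SSE2-less target:
    //   (f64 (sint_to_fp (i64 (load m)))) --> an FILD chain built by
    //   BuildFILD, converting the i64 directly from memory on the x87 stack.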
if (VT == MVT::f16 || VT == MVT::f128) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } return SDValue(); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), N->getOperand(2)), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit /// which is more useful than 0/1 in some cases. static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) { SDLoc DL(N); // "Condition code B" is also known as "the carry flag" (CF). SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8); SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS); MVT VT = N->getSimpleValueType(0); if (VT == MVT::i8) return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT)); assert(VT == MVT::i1 && "Unexpected type for SETCC node"); return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); // If this is an add, canonicalize a zext operand to the RHS. // TODO: Incomplete? What if both sides are zexts? if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && Y.getOpcode() != ISD::ZERO_EXTEND) std::swap(X, Y); // Look through a one-use zext. bool PeekedThroughZext = false; if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { Y = Y.getOperand(0); PeekedThroughZext = true; } // If this is an add, canonicalize a setcc operand to the RHS. // TODO: Incomplete? What if both sides are setcc? // TODO: Should we allow peeking through a zext of the other operand? 
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
      // This is a complicated way to get -1 or 0 from the carry flag:
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }

    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
      SDValue EFLAGS = Y->getOperand(1);
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
          EFLAGS.getValueType().isInteger() &&
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
        // Swap the operands of a SUB, and we have the same pattern as above.
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
        SDValue NewSub = DAG.getNode(
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
    // cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue Z = Cmp.getOperand(0);
  EVT ZVT = Z.getValueType();

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { SDValue Zero = DAG.getConstant(0, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' // with fake operands: // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, DAG.getConstant(-1ULL, DL, VT), Cmp1); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, DAG.getConstant(0, DL, VT), Cmp1); } static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDValue MulOp = N->getOperand(0); SDValue Phi = N->getOperand(1); if (MulOp.getOpcode() != ISD::MUL) std::swap(MulOp, Phi); if (MulOp.getOpcode() != ISD::MUL) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); EVT VT = N->getValueType(0); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; unsigned VectorSize = VT.getVectorNumElements() * 16; // If the vector size is less than 128, or greater than the supported RegSize, // do not use PMADD. if (VectorSize < 128 || VectorSize > RegSize) return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); // Shrink the operands of mul. 
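  // (Hedged note: VPMADDWD multiplies corresponding sign-extended i16
  // elements and adds adjacent pairs of the i32 products, so the shrunken
  // i16 multiply feeding an i32 add-reduction maps onto a single pmaddwd
  // whose result vector is half as wide; the zero-filled concat below
  // restores the original width.)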
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0)); SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1)); // Madd vector size is half of the original vector size SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1); // Fill the rest of the output with 0 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero); return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. if (!VT.isVector() || !VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) return SDValue(); unsigned RegSize = 128; if (Subtarget.hasBWI()) RegSize = 512; else if (Subtarget.hasAVX2()) RegSize = 256; // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. // To match SAD, we need the other operand to be a vector select. SDValue SelectOp, Phi; if (Op0.getOpcode() == ISD::VSELECT) { SelectOp = Op0; Phi = Op1; } else if (Op1.getOpcode() == ISD::VSELECT) { SelectOp = Op1; Phi = Op0; } else return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. if(!detectZextAbsDiff(SelectOp, Op0, Op1)) return SDValue(); // SAD pattern detected. Now build a SAD instruction and an addition for // reduction. Note that the number of elements of the result of SAD is less // than the number of elements of its input. Therefore, we could only update // part of elements in the reduction vector. SDValue Sad = createPSADBW(DAG, Op0, Op1, DL); // The output of PSADBW is a vector of i64. // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero // anyway. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); if (VT.getSizeInBits() >= ResVT.getSizeInBits()) Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); else Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { // Fill the upper elements with zero to match the add width. SDValue Zero = DAG.getConstant(0, DL, VT); Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, DAG.getIntPtrConstant(0, DL)); } return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> /// The all-ones vector constant can be materialized using a pcmpeq instruction /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. 
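/// For example (illustrative): "pcmpeqd %xmm1, %xmm1" sets %xmm1 to all-ones
/// without touching memory, so "X + <1,1,1,1>" can be emitted as
/// "X - <-1,-1,-1,-1>" using that idiom.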
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
  // out and wait for legalization if we have an unsupported vector length.
  EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
    return SDValue();

  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // PSUBUS is supported, starting from SSE2, but the special preprocessing
  // for v8i32 requires umin, which first appears in SSE41.
  if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
      !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
      !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
      !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
        (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
         VT == MVT::v8i64)))
    return SDValue();

  SDValue SubusLHS, SubusRHS;
  // Try to find umax(a,b) - b or a - umin(a,b) patterns
  // that may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else
    return SDValue();

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases.
  if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

  // The special preprocessing can be applied only if the value was
  // zero-extended from 16 bits, so we require the leading 16 bits to be zero
  // for 32-bit values (the leading 48 bits for 64-bit values).
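  // Sketch of the v8i32 case (hedged): with the top 16 bits of the inputs
  // known zero, a - umin(a, b) becomes
  //   zext(psubus(trunc(a), trunc(umin(b, 0xFFFF))))
  // on v8i16 lanes, which is exactly what the code below constructs.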
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
  unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zero-extended, truncate SubusRHS to its size:
  // SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
                               NewSubusLHS, NewSubusRHS);
  // Zero extend the result; it may be used somewhere as 32 bits. If not, the
  // zext and the following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  // Try to create PSUBUS if SUB's argument is max/min
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 64> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (scalar_to_vector (extract_vector_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits(); OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(), OrigVT.getVectorNumElements() / Ratio); OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV, DAG.getIntPtrConstant(0, DL)); } Op = DAG.getBitcast(OpVT, OrigV); return DAG.getNode(X86ISD::VZEXT, DL, VT, Op); } } return SDValue(); } static SDValue combineTestM(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // TEST (AND a, b) ,(AND a, b) -> TEST a, b if (Op0 == Op1 && Op1->getOpcode() == ISD::AND) return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0), Op0->getOperand(1)); // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero) // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero) if (ISD::isBuildVectorAllZeros(Op0.getNode()) || ISD::isBuildVectorAllZeros(Op1.getNode())) return getZeroVector(VT, Subtarget, DAG, DL); return SDValue(); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); SDLoc DL(N); if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) return getOnesVector(VT, DAG, DL); if (N->getOpcode() == X86ISD::PCMPGT) return getZeroVector(VT, Subtarget, DAG, DL); } return SDValue(); } static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); // Early out for mask vectors. if (OpVT.getVectorElementType() == MVT::i1) return SDValue(); SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); unsigned IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // Inserting zeros into zeros is a nop. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) return Vec; // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { unsigned Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec.getOperand(1), DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); } // If we're inserting a bitcast into zeros, rewrite the insert and move the // bitcast to the other side. This helps with detecting zero extending // during isel. // TODO: Is this useful for other indices than 0? if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getBitcast(NewVT, Vec), SubVec.getOperand(0), N->getOperand(2)); return DAG.getBitcast(OpVT, Insert); } } // If this is an insert of an extract, combine to a shuffle. Don't do this // if the insert or extract can be represented with a subregister operation. 
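  // As a hedged illustration with v8i32 X and the high half of v8i32 Y:
  //   insert_subvector X, (extract_subvector Y, 4), 4
  //   --> vector_shuffle X, Y, <0,1,2,3,12,13,14,15>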
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
                                                    Subtarget, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));

      // If we're inserting all zeros into the upper half, change this to
      // an insert into an all zeros vector. We will match this to a move
      // with implicit upper bit zeroing during isel.
      if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                           getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
                           Vec.getOperand(2));

      // If we are inserting into both halves of the vector, the starting
      // vector should be undef. If it isn't, make it so. Only do this if the
      // early insert has no other uses.
      // TODO: Should this be a generic DAG combine?
if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), SubVec2, Vec.getOperand(2)); DCI.AddToWorklist(Vec.getNode()); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, N->getOperand(2)); } } } return SDValue(); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { if (OpVT.getScalarType() == MVT::i1) return DAG.getConstant(1, SDLoc(N), OpVT); return getOnesVector(OpVT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( OpVT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: return combineExtractSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: case ISD::SRA: case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); 
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND_VECTOR_INREG: case X86ISD::VSEXT: case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMADDS1_RND: case X86ISD::FMADDS3_RND: case X86ISD::FMADDS1: case X86ISD::FMADDS3: case X86ISD::FMADD4S: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); case X86ISD::MGATHER: case X86ISD::MSCATTER: case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); } return SDValue(); } /// Return true if the target has native support for the specified value type /// and it is 'desirable' to use the type for the given node type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer and /// some i16 instructions are slow. bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; if (VT != MVT::i16) return true; switch (Opc) { default: return true; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } /// This function checks if any of the users of EFLAGS copies the EFLAGS. We /// know that the code that lowers COPY of EFLAGS has to use the stack, and if /// we don't adjust the stack we clobber the first frame index. /// See X86InstrInfo::copyPhysReg. 
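/// (Hedged note: the EFLAGS copy lowering materializes the flags through
/// pushf/popf-style stack traffic, which is why an unadjusted stack would
/// overwrite the first frame slot.)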
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for dag combiner
/// to promote the specified node. If true, it should return the desired
/// promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

bool X86TargetLowering::isDesirableToCombineBuildVectorToShuffleTruncate(
    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
         "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
      "Shuffle Mask expected to be legal");

  // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'v': case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'G': case 'C': case 'e': case 'Z': return C_Other; default: break; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; case 'Y': switch (Constraint[1]) { default: break; case 'z': case '0': return C_Register; case 'i': case 'm': case 'k': case 't': case '2': return C_RegisterClass; } } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; case 'Y': { unsigned Size = StringRef(constraint).size(); // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' char NextChar = Size == 2 ? constraint[1] : 'i'; if (Size > 2) break; switch (NextChar) { default: return CW_Invalid; // XMM0 case 'z': case '0': if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) case 'k': if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) return CW_Register; return CW_Invalid; // Any MMX reg case 'm': if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'Y' case 'i': case 't': case '2': if (!Subtarget.hasSSE2()) return CW_Invalid; break; } // Fall through (handle "Y" constraint). LLVM_FALLTHROUGH; } case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; LLVM_FALLTHROUGH; case 'x': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256())) weight = CW_Register; break; case 'k': // Enable conditional vector operations using %k<#> registers. 
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (isa(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget.hasSSE2()) return "Y"; if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. 
if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast(Op)) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. while (1) { if ((GA = dyn_cast(Op))) { Offset += GA->getOffset(); break; } else if (Op.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += C->getZExtValue(); Op = Op.getOperand(0); continue; } } else if (Op.getOpcode() == ISD::SUB) { if (ConstantSDNode *C = dyn_cast(Op.getOperand(1))) { Offset += -C->getZExtValue(); Op = Op.getOperand(0); continue; } } // Otherwise, this isn't something we can handle, reject it. 
return; } const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::GR8RegClass) || RC.hasSuperClassEq(&X86::GR16RegClass) || RC.hasSuperClassEq(&X86::GR32RegClass) || RC.hasSuperClassEq(&X86::GR64RegClass) || RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || RC.hasSuperClassEq(&X86::VR512RegClass); } std::pair X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { // Only supported in AVX512 or later. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32RegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16RegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8RegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1RegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64RegClass); } } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget.hasSSE2()) break; LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. 
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: return std::make_pair(0U, &X86::VR512RegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { switch (Constraint[1]) { default: break; case 'i': case 't': case '2': return getRegForInlineAsmConstraint(TRI, "Y", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': case '0': if (!Subtarget.hasSSE1()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { // Only supported in AVX512. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8WMRegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1WMRegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass); } } break; } } // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { Res.first = X86::FP0+Constraint[4]-'0'; Res.second = &X86::RFP80RegClass; return Res; } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) { Res.first = X86::FP0; Res.second = &X86::RFP80RegClass; return Res; } // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) { Res.first = X86::EFLAGS; Res.second = &X86::CCRRegClass; return Res; } // 'A' means [ER]AX + [ER]DX. if (Constraint == "A") { if (Subtarget.is64Bit()) { Res.first = X86::RAX; Res.second = &X86::GR64_ADRegClass; } else { assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"); Res.first = X86::EAX; Res.second = &X86::GR32_ADRegClass; } return Res; } return Res; } // Otherwise, check to see if this is a register class of the wrong value // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; // The generic code will match the first register class that contains the // given register. Thus, based on the ordering of the tablegened file, // the "plain" GR classes might not come first. // Therefore, use a helper method. 
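  // To make the remapping below concrete, a hedged illustration (exact
  // enumerators come from the X86 tablegen output): a constraint such as
  // "{ax}" with VT == MVT::i32 first resolves to the 16-bit register, and is
  // then widened to the matching 32-bit super-register:
  //
  //   Res = {X86::AX, &X86::GR16RegClass};        // generic lookup result
  //   getX86SubSuperRegisterOrZero(X86::AX, 32);  // yields X86::EAX
  //   Res = {X86::EAX, &X86::GR32RegClass};       // final mapping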
if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) : &X86::GR64RegClass; if (RC->contains(DestReg)) Res = std::make_pair(DestReg, RC); } else { // No register found/type mismatch. Res.first = 0; Res.second = nullptr; } } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). // E.g., // vaddps (%rsi,%drx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. 
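  // For context on the split-CSR hooks here (initializeSplitCSR /
  // insertCopiesSplitCSR): the selector invokes them when a function opts
  // into CXX_FAST_TLS-style CSR splitting. A hedged sketch of the caller
  // side; the surrounding setup is simplified and EntryMBB/Returns are
  // placeholders:
  //
  //   if (TLI->supportSplitCSR(MF)) {
  //     TLI->initializeSplitCSR(EntryMBB);
  //     // ... after instruction selection, with the return blocks collected:
  //     TLI->insertCopiesSplitCSR(EntryMBB, Returns);
  //   }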
X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO()) return ""; // We need a stack probe to conform to the Windows ABI. Choose the right // symbol. if (Subtarget.is64Bit()) return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } Index: head/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp =================================================================== --- head/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp (revision 328752) +++ head/contrib/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp (revision 328753) @@ -1,417 +1,418 @@ //===- CallSiteSplitting.cpp ----------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements a transformation that tries to split a call-site to pass // more constrained arguments if its argument is predicated in the control flow // so that we can expose better context to the later passes (e.g, inliner, jump // threading, or IPA-CP based function cloning, etc.). 
// As of now we support two cases:
//
// 1) Try to split a call-site with constrained arguments, if any constraints
// on any argument can be found by following the single predecessors of all
// the site's predecessors. Currently this pass only handles call-sites with 2
// predecessors. For example, in the code below, we try to split the call-site
// since we can predicate the argument (ptr) based on the OR condition.
//
// Split from:
//   if (!ptr || c)
//     callee(ptr);
// to:
//   if (!ptr)
//     callee(null)         // set the known constant value
//   else if (c)
//     callee(nonnull ptr)  // set non-null attribute in the argument
//
// 2) We can also split a call-site based on constant incoming values of a PHI.
// For example,
// from:
//   Header:
//    %c = icmp eq i32 %i1, %i2
//    br i1 %c, label %Tail, label %TBB
//   TBB:
//    br label %Tail
//   Tail:
//    %p = phi i32 [ 0, %Header], [ 1, %TBB]
//    call void @bar(i32 %p)
// to:
//   Header:
//    %c = icmp eq i32 %i1, %i2
//    br i1 %c, label %Tail-split0, label %TBB
//   TBB:
//    br label %Tail-split1
//   Tail-split0:
//    call void @bar(i32 0)
//    br label %Tail
//   Tail-split1:
//    call void @bar(i32 1)
//    br label %Tail
//   Tail:
//    %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "callsite-splitting"

STATISTIC(NumCallSiteSplit, "Number of call-sites split");

static void addNonNullAttribute(Instruction *CallI, Instruction *NewCallI,
                                Value *Op) {
  CallSite CS(NewCallI);
  unsigned ArgNo = 0;
  for (auto &I : CS.args()) {
    if (&*I == Op)
      CS.addParamAttr(ArgNo, Attribute::NonNull);
    ++ArgNo;
  }
}

static void setConstantInArgument(Instruction *CallI, Instruction *NewCallI,
                                  Value *Op, Constant *ConstValue) {
  CallSite CS(NewCallI);
  unsigned ArgNo = 0;
  for (auto &I : CS.args()) {
    if (&*I == Op)
      CS.setArgument(ArgNo, ConstValue);
    ++ArgNo;
  }
}

static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
  assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
  Value *Op0 = Cmp->getOperand(0);
  unsigned ArgNo = 0;
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I, ++ArgNo) {
    // Don't consider constants or arguments that are already known non-null.
    if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
      continue;

    if (*I == Op0)
      return true;
  }
  return false;
}

/// If From has a conditional jump to To, add the condition to Conditions,
/// if it is relevant to any argument at CS.
static void
recordCondition(const CallSite &CS, BasicBlock *From, BasicBlock *To,
                SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
  auto *BI = dyn_cast<BranchInst>(From->getTerminator());
  if (!BI || !BI->isConditional())
    return;

  CmpInst::Predicate Pred;
  Value *Cond = BI->getCondition();
  if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
    return;

  ICmpInst *Cmp = cast<ICmpInst>(Cond);
  if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
    if (isCondRelevantToAnyCallArgument(Cmp, CS))
      Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
                                     ? Pred
                                     : Cmp->getInversePredicate()});
}

/// Record ICmp conditions relevant to any argument in CS following Pred's
/// single successors.
If there are conflicting conditions along a path, like
/// x == 1 and x == 0, the first condition will be used.
static void
recordConditions(const CallSite &CS, BasicBlock *Pred,
                 SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
  recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
  BasicBlock *From = Pred;
  BasicBlock *To = Pred;
-  SmallPtrSet<BasicBlock *, 4> Visited = {From};
+  SmallPtrSet<BasicBlock *, 4> Visited;
  while (!Visited.count(From->getSinglePredecessor()) &&
         (From = From->getSinglePredecessor())) {
    recordCondition(CS, From, To, Conditions);
+    Visited.insert(From);
    To = From;
  }
}

static Instruction *
addConditions(CallSite &CS,
              SmallVectorImpl<std::pair<ICmpInst *, unsigned>> &Conditions) {
  if (Conditions.empty())
    return nullptr;

  Instruction *NewCI = CS.getInstruction()->clone();
  for (auto &Cond : Conditions) {
    Value *Arg = Cond.first->getOperand(0);
    Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
    if (Cond.second == ICmpInst::ICMP_EQ)
      setConstantInArgument(CS.getInstruction(), NewCI, Arg, ConstVal);
    else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
      assert(Cond.second == ICmpInst::ICMP_NE);
      addNonNullAttribute(CS.getInstruction(), NewCI, Arg);
    }
  }
  return NewCI;
}

static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
  SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
  assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
  return Preds;
}

static bool canSplitCallSite(CallSite CS) {
  // FIXME: As of now we handle only CallInst. InvokeInst could be handled
  // without too much effort.
  Instruction *Instr = CS.getInstruction();
  if (!isa<CallInst>(Instr))
    return false;

  // Allow splitting a call-site only when there is no instruction before the
  // call-site in the basic block. Based on this constraint, we only clone the
  // call instruction, and we do not move a call-site across any other
  // instruction.
  BasicBlock *CallSiteBB = Instr->getParent();
  if (Instr != CallSiteBB->getFirstNonPHIOrDbg())
    return false;

  // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
  SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
  if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
      isa<IndirectBrInst>(Preds[1]->getTerminator()))
    return false;

  return CallSiteBB->canSplitPredecessors();
}

/// Split the call-site CS into new predecessors, which are directly hooked
/// to each of its original predecessors pointed to by PredBB1 and PredBB2.
/// CallInst1 and CallInst2 will be the new call-sites placed in the new
/// predecessors split for PredBB1 and PredBB2, respectively.
/// For example, in the IR below with an OR condition, the call-site can
/// be split. Assuming PredBB1=Header and PredBB2=TBB, CallInst1 will be the
/// call-site placed between Header and Tail, and CallInst2 will be the
/// call-site between TBB and Tail.
///
/// From:
///
///   Header:
///     %c = icmp eq i32* %a, null
///     br i1 %c %Tail, %TBB
///   TBB:
///     %c2 = icmp eq i32* %b, null
///     br i1 %c %Tail, %End
///   Tail:
///     %ca = call i1 @callee (i32* %a, i32* %b)
///
/// to:
///
///   Header:                          // PredBB1 is Header
///     %c = icmp eq i32* %a, null
///     br i1 %c %Tail-split1, %TBB
///   TBB:                             // PredBB2 is TBB
///     %c2 = icmp eq i32* %b, null
///     br i1 %c %Tail-split2, %End
///   Tail-split1:
///     %ca1 = call @callee (i32* null, i32* %b)        // CallInst1
///     br %Tail
///   Tail-split2:
///     %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
///     br %Tail
///   Tail:
///     %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
///
/// Note that in case any arguments at the call-site are constrained by its
/// predecessors, new call-sites with more constrained arguments will be
/// created in createCallSitesOnPredicatedArgument().
static void splitCallSite(CallSite CS, BasicBlock *PredBB1, BasicBlock *PredBB2,
                          Instruction *CallInst1, Instruction *CallInst2) {
  Instruction *Instr = CS.getInstruction();
  BasicBlock *TailBB = Instr->getParent();
  assert(Instr == (TailBB->getFirstNonPHIOrDbg()) && "Unexpected call-site");

  BasicBlock *SplitBlock1 =
      SplitBlockPredecessors(TailBB, PredBB1, ".predBB1.split");
  BasicBlock *SplitBlock2 =
      SplitBlockPredecessors(TailBB, PredBB2, ".predBB2.split");

  assert((SplitBlock1 && SplitBlock2) && "Unexpected new basic block split.");

  if (!CallInst1)
    CallInst1 = Instr->clone();
  if (!CallInst2)
    CallInst2 = Instr->clone();

  CallInst1->insertBefore(&*SplitBlock1->getFirstInsertionPt());
  CallInst2->insertBefore(&*SplitBlock2->getFirstInsertionPt());

  CallSite CS1(CallInst1);
  CallSite CS2(CallInst2);

  // Handle PHIs used as arguments in the call-site.
  for (PHINode &PN : TailBB->phis()) {
    unsigned ArgNo = 0;
    for (auto &CI : CS.args()) {
      if (&*CI == &PN) {
        CS1.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock1));
        CS2.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock2));
      }
      ++ArgNo;
    }
  }

  // Replace users of the original call with a PHI merging the split
  // call-sites.
  if (Instr->getNumUses()) {
    PHINode *PN = PHINode::Create(Instr->getType(), 2, "phi.call",
                                  TailBB->getFirstNonPHI());
    PN->addIncoming(CallInst1, SplitBlock1);
    PN->addIncoming(CallInst2, SplitBlock2);
    Instr->replaceAllUsesWith(PN);
  }
  DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
  DEBUG(dbgs() << "    " << *CallInst1 << " in " << SplitBlock1->getName()
               << "\n");
  DEBUG(dbgs() << "    " << *CallInst2 << " in " << SplitBlock2->getName()
               << "\n");
  Instr->eraseFromParent();
  NumCallSiteSplit++;
}

// Return true if the call-site has an argument which is a PHI with only
// constant incoming values.
static bool isPredicatedOnPHI(CallSite CS) { Instruction *Instr = CS.getInstruction(); BasicBlock *Parent = Instr->getParent(); if (Instr != Parent->getFirstNonPHIOrDbg()) return false; for (auto &BI : *Parent) { if (PHINode *PN = dyn_cast(&BI)) { for (auto &I : CS.args()) if (&*I == PN) { assert(PN->getNumIncomingValues() == 2 && "Unexpected number of incoming values"); if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1)) return false; if (PN->getIncomingValue(0) == PN->getIncomingValue(1)) continue; if (isa(PN->getIncomingValue(0)) && isa(PN->getIncomingValue(1))) return true; } } break; } return false; } static bool tryToSplitOnPHIPredicatedArgument(CallSite CS) { if (!isPredicatedOnPHI(CS)) return false; auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); splitCallSite(CS, Preds[0], Preds[1], nullptr, nullptr); return true; } static bool tryToSplitOnPredicatedArgument(CallSite CS) { auto Preds = getTwoPredecessors(CS.getInstruction()->getParent()); if (Preds[0] == Preds[1]) return false; SmallVector, 2> C1, C2; recordConditions(CS, Preds[0], C1); recordConditions(CS, Preds[1], C2); Instruction *CallInst1 = addConditions(CS, C1); Instruction *CallInst2 = addConditions(CS, C2); if (!CallInst1 && !CallInst2) return false; splitCallSite(CS, Preds[1], Preds[0], CallInst2, CallInst1); return true; } static bool tryToSplitCallSite(CallSite CS) { if (!CS.arg_size() || !canSplitCallSite(CS)) return false; return tryToSplitOnPredicatedArgument(CS) || tryToSplitOnPHIPredicatedArgument(CS); } static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI) { bool Changed = false; for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) { BasicBlock &BB = *BI++; for (BasicBlock::iterator II = BB.begin(), IE = BB.end(); II != IE;) { Instruction *I = &*II++; CallSite CS(cast(I)); if (!CS || isa(I) || isInstructionTriviallyDead(I, &TLI)) continue; Function *Callee = CS.getCalledFunction(); if (!Callee || Callee->isDeclaration()) continue; Changed |= tryToSplitCallSite(CS); } } return Changed; } namespace { struct CallSiteSplittingLegacyPass : public FunctionPass { static char ID; CallSiteSplittingLegacyPass() : FunctionPass(ID) { initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); FunctionPass::getAnalysisUsage(AU); } bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; auto &TLI = getAnalysis().getTLI(); return doCallSiteSplitting(F, TLI); } }; } // namespace char CallSiteSplittingLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting", "Call-site splitting", false, false) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting", "Call-site splitting", false, false) FunctionPass *llvm::createCallSiteSplittingPass() { return new CallSiteSplittingLegacyPass(); } PreservedAnalyses CallSiteSplittingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TLI = AM.getResult(F); if (!doCallSiteSplitting(F, TLI)) return PreservedAnalyses::all(); PreservedAnalyses PA; return PA; } Index: head/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp =================================================================== --- head/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp (revision 328752) +++ head/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp (revision 328753) @@ -1,899 +1,953 @@ //===- StructurizeCFG.cpp 
-------------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/RegionPass.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include #include #include using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "structurizecfg" // The name for newly created blocks. static const char *const FlowBlockName = "Flow"; namespace { // Definition of the complex types used in this pass. using BBValuePair = std::pair; using RNVector = SmallVector; using BBVector = SmallVector; using BranchVector = SmallVector; using BBValueVector = SmallVector; using BBSet = SmallPtrSet; using PhiMap = MapVector; using BB2BBVecMap = MapVector; using BBPhiMap = DenseMap; using BBPredicates = DenseMap; using PredMap = DenseMap; using BB2BBMap = DenseMap; /// Finds the nearest common dominator of a set of BasicBlocks. /// /// For every BB you add to the set, you can specify whether we "remember" the /// block. When you get the common dominator, you can also ask whether it's one /// of the blocks we remembered. class NearestCommonDominator { DominatorTree *DT; BasicBlock *Result = nullptr; bool ResultIsRemembered = false; /// Add BB to the resulting dominator. void addBlock(BasicBlock *BB, bool Remember) { if (!Result) { Result = BB; ResultIsRemembered = Remember; return; } BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB); if (NewResult != Result) ResultIsRemembered = false; if (NewResult == BB) ResultIsRemembered |= Remember; Result = NewResult; } public: explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {} void addBlock(BasicBlock *BB) { addBlock(BB, /* Remember = */ false); } void addAndRememberBlock(BasicBlock *BB) { addBlock(BB, /* Remember = */ true); } /// Get the nearest common dominator of all the BBs added via addBlock() and /// addAndRememberBlock(). BasicBlock *result() { return Result; } /// Is the BB returned by getResult() one of the blocks we added to the set /// with addAndRememberBlock()? bool resultIsRememberedBlock() { return ResultIsRemembered; } }; /// @brief Transforms the control flow graph on one single entry/exit region /// at a time. 
/// /// After the transform all "If"/"Then"/"Else" style control flow looks like /// this: /// /// \verbatim /// 1 /// || /// | | /// 2 | /// | / /// |/ /// 3 /// || Where: /// | | 1 = "If" block, calculates the condition /// 4 | 2 = "Then" subregion, runs if the condition is true /// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow /// |/ 4 = "Else" optional subregion, runs if the condition is false /// 5 5 = "End" block, also rejoins the control flow /// \endverbatim /// /// Control flow is expressed as a branch where the true exit goes into the /// "Then"/"Else" region, while the false exit skips the region /// The condition for the optional "Else" region is expressed as a PHI node. /// The incoming values of the PHI node are true for the "If" edge and false /// for the "Then" edge. /// /// Additionally to that even complicated loops look like this: /// /// \verbatim /// 1 /// || /// | | /// 2 ^ Where: /// | / 1 = "Entry" block /// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block /// 3 3 = "Flow" block, with back edge to entry block /// | /// \endverbatim /// /// The back edge of the "Flow" block is always on the false side of the branch /// while the true side continues the general flow. So the loop condition /// consist of a network of PHI nodes where the true incoming values expresses /// breaks and the false values expresses continue states. class StructurizeCFG : public RegionPass { bool SkipUniformRegions; Type *Boolean; ConstantInt *BoolTrue; ConstantInt *BoolFalse; UndefValue *BoolUndef; Function *Func; Region *ParentRegion; DominatorTree *DT; + LoopInfo *LI; - std::deque Order; + SmallVector Order; BBSet Visited; BBPhiMap DeletedPhis; BB2BBVecMap AddedPhis; PredMap Predicates; BranchVector Conditions; BB2BBMap Loops; PredMap LoopPreds; BranchVector LoopConds; RegionNode *PrevNode; void orderNodes(); void analyzeLoops(RegionNode *N); Value *invert(Value *Condition); Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert); void gatherPredicates(RegionNode *N); - void analyzeNode(RegionNode *N); + void collectInfos(); void insertConditions(bool Loops); void delPhiValues(BasicBlock *From, BasicBlock *To); void addPhiValues(BasicBlock *From, BasicBlock *To); void setPhiValues(); void killTerminator(BasicBlock *BB); void changeExit(RegionNode *Node, BasicBlock *NewExit, bool IncludeDominator); BasicBlock *getNextFlow(BasicBlock *Dominator); BasicBlock *needPrefix(bool NeedEmpty); BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed); void setPrevNode(BasicBlock *BB); bool dominatesPredicates(BasicBlock *BB, RegionNode *Node); bool isPredictableTrue(RegionNode *Node); void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd); void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); void createFlow(); void rebuildSSA(); public: static char ID; explicit StructurizeCFG(bool SkipUniformRegions = false) : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) { initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); } bool doInitialization(Region *R, RGPassManager &RGM) override; bool runOnRegion(Region *R, RGPassManager &RGM) override; StringRef getPassName() const override { return "Structurize control flow"; } void getAnalysisUsage(AnalysisUsage &AU) const override { if (SkipUniformRegions) AU.addRequired(); AU.addRequiredID(LowerSwitchID); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); RegionPass::getAnalysisUsage(AU); } }; } // end anonymous namespace char StructurizeCFG::ID = 0; 
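// For intuition about the loop-aware ordering rebuilt in orderNodes() below:
// a plain reverse post-order traversal can visit backedge blocks of an outer
// loop before those of an inner loop. The counting idea used below, as a
// minimal standalone sketch:
//
//   // Count how many region nodes each loop contributes, so that when the
//   // loop depth decreases we can flush the not-yet-emitted blocks of the
//   // current loop before emitting blocks of the enclosing loop.
//   SmallDenseMap<Loop *, unsigned> LoopBlocks;
//   for (RegionNode *RN : RPOT)
//     ++LoopBlocks[LI->getLoopFor(RN->getEntry())];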
INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_DEPENDENCY(LowerSwitch) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(RegionInfoPass) INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG", false, false) /// \brief Initialize the types and constants used in the pass bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { LLVMContext &Context = R->getEntry()->getContext(); Boolean = Type::getInt1Ty(Context); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); BoolUndef = UndefValue::get(Boolean); return false; } /// \brief Build up the general order of nodes void StructurizeCFG::orderNodes() { - assert(Visited.empty()); - assert(Predicates.empty()); - assert(Loops.empty()); - assert(LoopPreds.empty()); + ReversePostOrderTraversal RPOT(ParentRegion); + SmallDenseMap LoopBlocks; - // This must be RPO order for the back edge detection to work - for (RegionNode *RN : ReversePostOrderTraversal(ParentRegion)) { - // FIXME: Is there a better order to use for structurization? - Order.push_back(RN); - analyzeNode(RN); + // The reverse post-order traversal of the list gives us an ordering close + // to what we want. The only problem with it is that sometimes backedges + // for outer loops will be visited before backedges for inner loops. + for (RegionNode *RN : RPOT) { + BasicBlock *BB = RN->getEntry(); + Loop *Loop = LI->getLoopFor(BB); + ++LoopBlocks[Loop]; } + + unsigned CurrentLoopDepth = 0; + Loop *CurrentLoop = nullptr; + for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) { + BasicBlock *BB = (*I)->getEntry(); + unsigned LoopDepth = LI->getLoopDepth(BB); + + if (is_contained(Order, *I)) + continue; + + if (LoopDepth < CurrentLoopDepth) { + // Make sure we have visited all blocks in this loop before moving back to + // the outer loop. + + auto LoopI = I; + while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) { + LoopI++; + BasicBlock *LoopBB = (*LoopI)->getEntry(); + if (LI->getLoopFor(LoopBB) == CurrentLoop) { + --BlockCount; + Order.push_back(*LoopI); + } + } + } + + CurrentLoop = LI->getLoopFor(BB); + if (CurrentLoop) + LoopBlocks[CurrentLoop]--; + + CurrentLoopDepth = LoopDepth; + Order.push_back(*I); + } + + // This pass originally used a post-order traversal and then operated on + // the list in reverse. Now that we are using a reverse post-order traversal + // rather than re-working the whole pass to operate on the list in order, + // we just reverse the list and continue to operate on it in reverse. 
+ std::reverse(Order.begin(), Order.end()); } /// \brief Determine the end of the loops void StructurizeCFG::analyzeLoops(RegionNode *N) { if (N->isSubRegion()) { // Test for exit as back edge BasicBlock *Exit = N->getNodeAs()->getExit(); if (Visited.count(Exit)) Loops[Exit] = N->getEntry(); } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs(); BranchInst *Term = cast(BB->getTerminator()); for (BasicBlock *Succ : Term->successors()) if (Visited.count(Succ)) Loops[Succ] = BB; } } /// \brief Invert the given condition Value *StructurizeCFG::invert(Value *Condition) { // First: Check if it's a constant if (Constant *C = dyn_cast(Condition)) return ConstantExpr::getNot(C); // Second: If the condition is already inverted, return the original value if (match(Condition, m_Not(m_Value(Condition)))) return Condition; if (Instruction *Inst = dyn_cast(Condition)) { // Third: Check all the users for an invert BasicBlock *Parent = Inst->getParent(); for (User *U : Condition->users()) if (Instruction *I = dyn_cast(U)) if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition)))) return I; // Last option: Create a new instruction return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator()); } if (Argument *Arg = dyn_cast(Condition)) { BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock(); return BinaryOperator::CreateNot(Condition, Arg->getName() + ".inv", EntryBlock.getTerminator()); } llvm_unreachable("Unhandled condition to invert"); } /// \brief Build the condition for one edge Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx, bool Invert) { Value *Cond = Invert ? BoolFalse : BoolTrue; if (Term->isConditional()) { Cond = Term->getCondition(); if (Idx != (unsigned)Invert) Cond = invert(Cond); } return Cond; } /// \brief Analyze the predecessors of each block and build up predicates void StructurizeCFG::gatherPredicates(RegionNode *N) { RegionInfo *RI = ParentRegion->getRegionInfo(); BasicBlock *BB = N->getEntry(); BBPredicates &Pred = Predicates[BB]; BBPredicates &LPred = LoopPreds[BB]; for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry if (!ParentRegion->contains(P)) continue; Region *R = RI->getRegionFor(P); if (R == ParentRegion) { // It's a top level block in our region BranchInst *Term = cast(P->getTerminator()); for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { BasicBlock *Succ = Term->getSuccessor(i); if (Succ != BB) continue; if (Visited.count(P)) { // Normal forward edge if (Term->isConditional()) { // Try to treat it like an ELSE block BasicBlock *Other = Term->getSuccessor(!i); if (Visited.count(Other) && !Loops.count(Other) && !Pred.count(Other) && !Pred.count(P)) { Pred[Other] = BoolFalse; Pred[P] = BoolTrue; continue; } } Pred[P] = buildCondition(Term, i, false); } else { // Back edge LPred[P] = buildCondition(Term, i, true); } } } else { // It's an exit from a sub region while (R->getParent() != ParentRegion) R = R->getParent(); // Edge from inside a subregion to its entry, ignore it if (*R == *N) continue; BasicBlock *Entry = R->getEntry(); if (Visited.count(Entry)) Pred[Entry] = BoolTrue; else LPred[Entry] = BoolFalse; } } } /// \brief Collect various loop and predicate infos -void StructurizeCFG::analyzeNode(RegionNode *RN) { - DEBUG(dbgs() << "Visiting: " - << (RN->isSubRegion() ? 
"SubRegion with entry: " : "") - << RN->getEntry()->getName() << '\n'); +void StructurizeCFG::collectInfos() { + // Reset predicate + Predicates.clear(); - // Analyze all the conditions leading to a node - gatherPredicates(RN); + // and loop infos + Loops.clear(); + LoopPreds.clear(); - // Remember that we've seen this node - Visited.insert(RN->getEntry()); + // Reset the visited nodes + Visited.clear(); - // Find the last back edges - analyzeLoops(RN); + for (RegionNode *RN : reverse(Order)) { + DEBUG(dbgs() << "Visiting: " + << (RN->isSubRegion() ? "SubRegion with entry: " : "") + << RN->getEntry()->getName() << " Loop Depth: " + << LI->getLoopDepth(RN->getEntry()) << "\n"); + + // Analyze all the conditions leading to a node + gatherPredicates(RN); + + // Remember that we've seen this node + Visited.insert(RN->getEntry()); + + // Find the last back edges + analyzeLoops(RN); + } } /// \brief Insert the missing branch conditions void StructurizeCFG::insertConditions(bool Loops) { BranchVector &Conds = Loops ? LoopConds : Conditions; Value *Default = Loops ? BoolTrue : BoolFalse; SSAUpdater PhiInserter; for (BranchInst *Term : Conds) { assert(Term->isConditional()); BasicBlock *Parent = Term->getParent(); BasicBlock *SuccTrue = Term->getSuccessor(0); BasicBlock *SuccFalse = Term->getSuccessor(1); PhiInserter.Initialize(Boolean, ""); PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default); PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default); BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue]; NearestCommonDominator Dominator(DT); Dominator.addBlock(Parent); Value *ParentValue = nullptr; for (std::pair BBAndPred : Preds) { BasicBlock *BB = BBAndPred.first; Value *Pred = BBAndPred.second; if (BB == Parent) { ParentValue = Pred; break; } PhiInserter.AddAvailableValue(BB, Pred); Dominator.addAndRememberBlock(BB); } if (ParentValue) { Term->setCondition(ParentValue); } else { if (!Dominator.resultIsRememberedBlock()) PhiInserter.AddAvailableValue(Dominator.result(), Default); Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent)); } } } /// \brief Remove all PHI values coming from "From" into "To" and remember /// them in DeletedPhis void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { PhiMap &Map = DeletedPhis[To]; for (PHINode &Phi : To->phis()) { while (Phi.getBasicBlockIndex(From) != -1) { Value *Deleted = Phi.removeIncomingValue(From, false); Map[&Phi].push_back(std::make_pair(From, Deleted)); } } } /// \brief Add a dummy PHI value as soon as we knew the new predecessor void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { for (PHINode &Phi : To->phis()) { Value *Undef = UndefValue::get(Phi.getType()); Phi.addIncoming(Undef, From); } AddedPhis[To].push_back(From); } /// \brief Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SSAUpdater Updater; for (const auto &AddedPhi : AddedPhis) { BasicBlock *To = AddedPhi.first; const BBVector &From = AddedPhi.second; if (!DeletedPhis.count(To)) continue; PhiMap &Map = DeletedPhis[To]; for (const auto &PI : Map) { PHINode *Phi = PI.first; Value *Undef = UndefValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); NearestCommonDominator Dominator(DT); Dominator.addBlock(To); for (const auto &VI : PI.second) { Updater.AddAvailableValue(VI.first, VI.second); Dominator.addAndRememberBlock(VI.first); } if 
(!Dominator.resultIsRememberedBlock())
        Updater.AddAvailableValue(Dominator.result(), Undef);

      for (BasicBlock *FI : From) {
        int Idx = Phi->getBasicBlockIndex(FI);
        assert(Idx != -1);
        Phi->setIncomingValue(Idx, Updater.GetValueAtEndOfBlock(FI));
      }
    }

    DeletedPhis.erase(To);
  }
  assert(DeletedPhis.empty());
}

/// \brief Remove phi values from all successors and then remove the
/// terminator.
void StructurizeCFG::killTerminator(BasicBlock *BB) {
  TerminatorInst *Term = BB->getTerminator();
  if (!Term)
    return;

  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
    delPhiValues(BB, *SI);

  Term->eraseFromParent();
}

/// \brief Let node exit(s) point to NewExit
void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
                                bool IncludeDominator) {
  if (Node->isSubRegion()) {
    Region *SubRegion = Node->getNodeAs<Region>();
    BasicBlock *OldExit = SubRegion->getExit();
    BasicBlock *Dominator = nullptr;

    // Find all the edges from the sub region to the exit
    for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
      // Increment BBI before mucking with BB's terminator.
      BasicBlock *BB = *BBI++;

      if (!SubRegion->contains(BB))
        continue;

      // Modify the edges to point to the new exit
      delPhiValues(BB, OldExit);
      BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
      addPhiValues(BB, NewExit);

      // Find the new dominator (if requested)
      if (IncludeDominator) {
        if (!Dominator)
          Dominator = BB;
        else
          Dominator = DT->findNearestCommonDominator(Dominator, BB);
      }
    }

    // Change the dominator (if requested)
    if (Dominator)
      DT->changeImmediateDominator(NewExit, Dominator);

    // Update the region info
    SubRegion->replaceExit(NewExit);
  } else {
    BasicBlock *BB = Node->getNodeAs<BasicBlock>();
    killTerminator(BB);
    BranchInst::Create(NewExit, BB);
    addPhiValues(BB, NewExit);
    if (IncludeDominator)
      DT->changeImmediateDominator(NewExit, BB);
  }
}

/// \brief Create a new flow node and update dominator tree and region info
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
  LLVMContext &Context = Func->getContext();
  BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
-                      Order.front()->getEntry();
+                      Order.back()->getEntry();
  BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
                                        Func, Insert);
  DT->addNewBlock(Flow, Dominator);
  ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
  return Flow;
}

/// \brief Create a new or reuse the previous node as flow node
BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
  BasicBlock *Entry = PrevNode->getEntry();

  if (!PrevNode->isSubRegion()) {
    killTerminator(Entry);
    if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
      return Entry;
  }

  // create a new flow node
  BasicBlock *Flow = getNextFlow(Entry);

  // and wire it up
  changeExit(PrevNode, Flow, true);
  PrevNode = ParentRegion->getBBNode(Flow);
  return Flow;
}

/// \brief Returns the region exit if possible, otherwise just a new flow node
BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
                                        bool ExitUseAllowed) {
  if (!Order.empty() || !ExitUseAllowed)
    return getNextFlow(Flow);

  BasicBlock *Exit = ParentRegion->getExit();
  DT->changeImmediateDominator(Exit, Flow);
  addPhiValues(Flow, Exit);
  return Exit;
}

/// \brief Set the previous node
void StructurizeCFG::setPrevNode(BasicBlock *BB) {
  PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
                                        : nullptr;
}

/// \brief Does BB dominate all the predicates of Node?
bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
  BBPredicates &Preds = Predicates[Node->getEntry()];
  return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
    return DT->dominates(BB, Pred.first);
  });
}

/// \brief Can we predict that this node will always be called?
bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
  BBPredicates &Preds = Predicates[Node->getEntry()];
  bool Dominated = false;

  // Region entry is always true
  if (!PrevNode)
    return true;

  for (std::pair<BasicBlock *, Value *> Pred : Preds) {
    BasicBlock *BB = Pred.first;
    Value *V = Pred.second;

    if (V != BoolTrue)
      return false;

    if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
      Dominated = true;
  }

  // TODO: The dominator check is too strict
  return Dominated;
}

/// Take one node from the order vector and wire it up
void StructurizeCFG::wireFlow(bool ExitUseAllowed,
                              BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.front();
-  Order.pop_front();
+  RegionNode *Node = Order.pop_back_val();
  Visited.insert(Node->getEntry());

  if (isPredictableTrue(Node)) {
    // Just a linear flow
    if (PrevNode) {
      changeExit(PrevNode, Node->getEntry(), true);
    }
    PrevNode = Node;
  } else {
    // Insert extra prefix node (or reuse last one)
    BasicBlock *Flow = needPrefix(false);

    // Insert extra postfix node (or use exit instead)
    BasicBlock *Entry = Node->getEntry();
    BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);

    // let it point to entry and next block
    Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
    addPhiValues(Flow, Entry);
    DT->changeImmediateDominator(Entry, Flow);

    PrevNode = Node;
    while (!Order.empty() && !Visited.count(LoopEnd) &&
-           dominatesPredicates(Entry, Order.front())) {
+           dominatesPredicates(Entry, Order.back())) {
      handleLoops(false, LoopEnd);
    }

    changeExit(PrevNode, Next, false);
    setPrevNode(Next);
  }
}

void StructurizeCFG::handleLoops(bool ExitUseAllowed,
                                 BasicBlock *LoopEnd) {
-  RegionNode *Node = Order.front();
+  RegionNode *Node = Order.back();
  BasicBlock *LoopStart = Node->getEntry();

  if (!Loops.count(LoopStart)) {
    wireFlow(ExitUseAllowed, LoopEnd);
    return;
  }

  if (!isPredictableTrue(Node))
    LoopStart = needPrefix(true);

  LoopEnd = Loops[Node->getEntry()];
  wireFlow(false, LoopEnd);
  while (!Visited.count(LoopEnd)) {
    handleLoops(false, LoopEnd);
  }

  // If the start of the loop is the entry block, we can't branch to it so
  // insert a new dummy entry block.
  Function *LoopFunc = LoopStart->getParent();
  if (LoopStart == &LoopFunc->getEntryBlock()) {
    LoopStart->setName("entry.orig");

    BasicBlock *NewEntry =
      BasicBlock::Create(LoopStart->getContext(),
                         "entry",
                         LoopFunc,
                         LoopStart);
    BranchInst::Create(LoopStart, NewEntry);
    DT->setNewRoot(NewEntry);
  }

  // Create an extra loop end node
  LoopEnd = needPrefix(false);
  BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
  LoopConds.push_back(BranchInst::Create(Next, LoopStart,
                                         BoolUndef, LoopEnd));
  addPhiValues(LoopEnd, LoopStart);
  setPrevNode(Next);
}

/// After this function control flow looks like it should be, but
/// branches and PHI nodes only have undefined conditions.
void StructurizeCFG::createFlow() {
  BasicBlock *Exit = ParentRegion->getExit();
  bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);

  DeletedPhis.clear();
  AddedPhis.clear();
  Conditions.clear();
  LoopConds.clear();

  PrevNode = nullptr;
  Visited.clear();

  while (!Order.empty()) {
    handleLoops(EntryDominatesExit, nullptr);
  }

  if (PrevNode)
    changeExit(PrevNode, Exit, EntryDominatesExit);
  else
    assert(EntryDominatesExit);
}

/// Handle a rare case where the disintegrated nodes' instructions
/// no longer dominate all their uses. Not sure if this is really necessary.
void StructurizeCFG::rebuildSSA() {
  SSAUpdater Updater;
  for (BasicBlock *BB : ParentRegion->blocks())
    for (Instruction &I : *BB) {
      bool Initialized = false;
      // We may modify the use list as we iterate over it, so be careful to
      // compute the next element in the use list at the top of the loop.
      for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
        Use &U = *UI++;
        Instruction *User = cast<Instruction>(U.getUser());
        if (User->getParent() == BB) {
          continue;
        } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
          if (UserPN->getIncomingBlock(U) == BB)
            continue;
        }

        if (DT->dominates(&I, User))
          continue;

        if (!Initialized) {
          Value *Undef = UndefValue::get(I.getType());
          Updater.Initialize(I.getType(), "");
          Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
          Updater.AddAvailableValue(BB, &I);
          Initialized = true;
        }
        Updater.RewriteUseAfterInsertions(U);
      }
    }
}

static bool hasOnlyUniformBranches(const Region *R,
                                   const DivergenceAnalysis &DA) {
  for (const BasicBlock *BB : R->blocks()) {
    const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
    if (!Br || !Br->isConditional())
      continue;

    if (!DA.isUniform(Br->getCondition()))
      return false;
    DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
  }
  return true;
}

/// \brief Run the transformation for each region found
bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
  if (R->isTopLevelRegion())
    return false;

  if (SkipUniformRegions) {
    // TODO: We could probably be smarter here with how we handle sub-regions.
    auto &DA = getAnalysis<DivergenceAnalysis>();
    if (hasOnlyUniformBranches(R, DA)) {
      DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
                   << '\n');

      // Mark all direct child block terminators as having been treated as
      // uniform. To account for a possible future in which non-uniform
      // sub-regions are treated more cleverly, indirect children are not
      // marked as uniform.
MDNode *MD = MDNode::get(R->getEntry()->getParent()->getContext(), {}); for (RegionNode *E : R->elements()) { if (E->isSubRegion()) continue; if (Instruction *Term = E->getEntry()->getTerminator()) Term->setMetadata("structurizecfg.uniform", MD); } return false; } } Func = R->getEntry()->getParent(); ParentRegion = R; DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); orderNodes(); - + collectInfos(); createFlow(); insertConditions(false); insertConditions(true); setPhiValues(); rebuildSSA(); // Cleanup Order.clear(); Visited.clear(); DeletedPhis.clear(); AddedPhis.clear(); Predicates.clear(); Conditions.clear(); Loops.clear(); LoopPreds.clear(); LoopConds.clear(); return true; } Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) { return new StructurizeCFG(SkipUniformRegions); } Index: head/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp =================================================================== --- head/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp (revision 328752) +++ head/contrib/llvm/lib/Transforms/Utils/ValueMapper.cpp (revision 328753) @@ -1,1132 +1,1143 @@ //===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the MapValue function, which is shared by various parts of // the lib/Transforms/Utils library. // //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include #include #include #include using namespace llvm; // Out of line method to get vtable etc for class. void ValueMapTypeRemapper::anchor() {} void ValueMaterializer::anchor() {} namespace { /// A basic block used in a BlockAddress whose function body is not yet /// materialized. 
struct DelayedBasicBlock { BasicBlock *OldBB; std::unique_ptr TempBB; DelayedBasicBlock(const BlockAddress &Old) : OldBB(Old.getBasicBlock()), TempBB(BasicBlock::Create(Old.getContext())) {} }; struct WorklistEntry { enum EntryKind { MapGlobalInit, MapAppendingVar, MapGlobalAliasee, RemapFunction }; struct GVInitTy { GlobalVariable *GV; Constant *Init; }; struct AppendingGVTy { GlobalVariable *GV; Constant *InitPrefix; }; struct GlobalAliaseeTy { GlobalAlias *GA; Constant *Aliasee; }; unsigned Kind : 2; unsigned MCID : 29; unsigned AppendingGVIsOldCtorDtor : 1; unsigned AppendingGVNumNewMembers; union { GVInitTy GVInit; AppendingGVTy AppendingGV; GlobalAliaseeTy GlobalAliasee; Function *RemapF; } Data; }; struct MappingContext { ValueToValueMapTy *VM; ValueMaterializer *Materializer = nullptr; /// Construct a MappingContext with a value map and materializer. explicit MappingContext(ValueToValueMapTy &VM, ValueMaterializer *Materializer = nullptr) : VM(&VM), Materializer(Materializer) {} }; class Mapper { friend class MDNodeMapper; #ifndef NDEBUG DenseSet AlreadyScheduled; #endif RemapFlags Flags; ValueMapTypeRemapper *TypeMapper; unsigned CurrentMCID = 0; SmallVector MCs; SmallVector Worklist; SmallVector DelayedBBs; SmallVector AppendingInits; public: Mapper(ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) : Flags(Flags), TypeMapper(TypeMapper), MCs(1, MappingContext(VM, Materializer)) {} /// ValueMapper should explicitly call \a flush() before destruction. ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); } bool hasWorkToDo() const { return !Worklist.empty(); } unsigned registerAlternateMappingContext(ValueToValueMapTy &VM, ValueMaterializer *Materializer = nullptr) { MCs.push_back(MappingContext(VM, Materializer)); return MCs.size() - 1; } void addFlags(RemapFlags Flags); void remapGlobalObjectMetadata(GlobalObject &GO); Value *mapValue(const Value *V); void remapInstruction(Instruction *I); void remapFunction(Function &F); Constant *mapConstant(const Constant *C) { return cast_or_null(mapValue(C)); } /// Map metadata. /// /// Find the mapping for MD. Guarantees that the return will be resolved /// (not an MDNode, or MDNode::isResolved() returns true). Metadata *mapMetadata(const Metadata *MD); void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MCID); void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef NewMembers, unsigned MCID); void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, unsigned MCID); void scheduleRemapFunction(Function &F, unsigned MCID); void flush(); private: void mapGlobalInitializer(GlobalVariable &GV, Constant &Init); void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef NewMembers); void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee); void remapFunction(Function &F, ValueToValueMapTy &VM); ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; } ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; } Value *mapBlockAddress(const BlockAddress &BA); /// Map metadata that doesn't require visiting operands. Optional mapSimpleMetadata(const Metadata *MD); Metadata *mapToMetadata(const Metadata *Key, Metadata *Val); Metadata *mapToSelf(const Metadata *MD); }; class MDNodeMapper { Mapper &M; /// Data about a node in \a UniquedGraph. 
class MDNodeMapper {
  Mapper &M;

  /// Data about a node in \a UniquedGraph.
  struct Data {
    bool HasChanged = false;
    unsigned ID = std::numeric_limits<unsigned>::max();
    TempMDNode Placeholder;
  };

  /// A graph of uniqued nodes.
  struct UniquedGraph {
    SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
    SmallVector<MDNode *, 16> POT;                  // Post-order traversal.

    /// Propagate changed operands through the post-order traversal.
    ///
    /// Iteratively update \a Data::HasChanged for each node based on \a
    /// Data::HasChanged of its operands, until fixed point.
    void propagateChanges();

    /// Get a forward reference to a node to use as an operand.
    Metadata &getFwdReference(MDNode &Op);
  };

  /// Worklist of distinct nodes whose operands need to be remapped.
  SmallVector<MDNode *, 16> DistinctWorklist;

  // Storage for a UniquedGraph.
  SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
  SmallVector<MDNode *, 16> POTStorage;

public:
  MDNodeMapper(Mapper &M) : M(M) {}

  /// Map a metadata node (and its transitive operands).
  ///
  /// Map all the (unmapped) nodes in the subgraph under \c N.  The iterative
  /// algorithm handles distinct nodes and uniqued node subgraphs using
  /// different strategies.
  ///
  /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
  /// using \a mapDistinctNode().  Their mapping can always be computed
  /// immediately without visiting operands, even if their operands change.
  ///
  /// The mapping for uniqued nodes depends on whether their operands change.
  /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
  /// a node to calculate uniqued node mappings in bulk.  Distinct leaves are
  /// added to \a DistinctWorklist with \a mapDistinctNode().
  ///
  /// After mapping \c N itself, this function remaps the operands of the
  /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
  /// N has been mapped.
  Metadata *map(const MDNode &N);

private:
  /// Map a top-level uniqued node and the uniqued subgraph underneath it.
  ///
  /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
  /// underneath \c FirstN and calculates the nodes' mapping.  Each node uses
  /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
  /// operands use the identity mapping.
  ///
  /// The algorithm works as follows:
  ///
  ///  1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
  ///     save the post-order traversal in the given \a UniquedGraph, tracking
  ///     whether nodes' operands change.
  ///
  ///  2. \a UniquedGraph::propagateChanges(): propagate changed operands
  ///     through the \a UniquedGraph until fixed point, following the rule
  ///     that if a node changes, any node that references it must also change.
  ///
  ///  3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
  ///     (referencing new operands) where necessary.
  Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);

  /// Try to map the operand of an \a MDNode.
  ///
  /// If \c Op is already mapped, return the mapping.  If it's not an \a
  /// MDNode, compute and return the mapping.  If it's a distinct \a MDNode,
  /// return the result of \a mapDistinctNode().
  ///
  /// \return None if \c Op is an unmapped uniqued \a MDNode.
  /// \post getMappedOp(Op) only returns None if this returns None.
  Optional<Metadata *> tryToMapOperand(const Metadata *Op);

  /// Map a distinct node.
  ///
  /// Return the mapping for the distinct node \c N, saving the result in \a
  /// DistinctWorklist for later remapping.
  ///
  /// \pre \c N is not yet mapped.
  /// \pre \c N.isDistinct().
  MDNode *mapDistinctNode(const MDNode &N);

  /// Get a previously mapped node.
  Optional<Metadata *> getMappedOp(const Metadata *Op) const;

  /// Create a post-order traversal of an unmapped uniqued node subgraph.
/// /// This traverses the metadata graph deeply enough to map \c FirstN. It /// uses \a tryToMapOperand() (via \a Mapper::mapSimplifiedNode()), so any /// metadata that has already been mapped will not be part of the POT. /// /// Each node that has a changed operand from outside the graph (e.g., a /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata) /// is marked with \a Data::HasChanged. /// /// \return \c true if any nodes in \c G have \a Data::HasChanged. /// \post \c G.POT is a post-order traversal ending with \c FirstN. /// \post \a Data::hasChanged in \c G.Info indicates whether any node needs /// to change because of operands outside the graph. bool createPOT(UniquedGraph &G, const MDNode &FirstN); /// Visit the operands of a uniqued node in the POT. /// /// Visit the operands in the range from \c I to \c E, returning the first /// uniqued node we find that isn't yet in \c G. \c I is always advanced to /// where to continue the loop through the operands. /// /// This sets \c HasChanged if any of the visited operands change. MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I, MDNode::op_iterator E, bool &HasChanged); /// Map all the nodes in the given uniqued graph. /// /// This visits all the nodes in \c G in post-order, using the identity /// mapping or creating a new node depending on \a Data::HasChanged. /// /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of /// their operands outside of \c G. /// \pre \a Data::HasChanged is true for a node in \c G iff any of its /// operands have changed. /// \post \a getMappedOp() returns the mapped node for every node in \c G. void mapNodesInPOT(UniquedGraph &G); /// Remap a node's operands using the given functor. /// /// Iterate through the operands of \c N and update them in place using \c /// mapOperand. /// /// \pre N.isDistinct() or N.isTemporary(). template void remapOperands(MDNode &N, OperandMapper mapOperand); }; } // end anonymous namespace Value *Mapper::mapValue(const Value *V) { ValueToValueMapTy::iterator I = getVM().find(V); // If the value already exists in the map, use it. if (I != getVM().end()) { assert(I->second && "Unexpected null mapping"); return I->second; } // If we have a materializer and it can materialize a value, use that. if (auto *Materializer = getMaterializer()) { if (Value *NewV = Materializer->materialize(const_cast(V))) { getVM()[V] = NewV; return NewV; } } // Global values do not need to be seeded into the VM if they // are using the identity mapping. if (isa(V)) { if (Flags & RF_NullMapMissingGlobalValues) return nullptr; return getVM()[V] = const_cast(V); } if (const InlineAsm *IA = dyn_cast(V)) { // Inline asm may need *type* remapping. FunctionType *NewTy = IA->getFunctionType(); if (TypeMapper) { NewTy = cast(TypeMapper->remapType(NewTy)); if (NewTy != IA->getFunctionType()) V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(), IA->hasSideEffects(), IA->isAlignStack()); } return getVM()[V] = const_cast(V); } if (const auto *MDV = dyn_cast(V)) { const Metadata *MD = MDV->getMetadata(); if (auto *LAM = dyn_cast(MD)) { // Look through to grab the local value. if (Value *LV = mapValue(LAM->getValue())) { if (V == LAM->getValue()) return const_cast(V); return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV)); } // FIXME: always return nullptr once Verifier::verifyDominatesUse() // ensures metadata operands only reference defined SSA values. return (Flags & RF_IgnoreMissingLocals) ? 
nullptr : MetadataAsValue::get(V->getContext(), MDTuple::get(V->getContext(), None)); } // If this is a module-level metadata and we know that nothing at the module // level is changing, then use an identity mapping. if (Flags & RF_NoModuleLevelChanges) return getVM()[V] = const_cast(V); // Map the metadata and turn it into a value. auto *MappedMD = mapMetadata(MD); if (MD == MappedMD) return getVM()[V] = const_cast(V); return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD); } // Okay, this either must be a constant (which may or may not be mappable) or // is something that is not in the mapping table. Constant *C = const_cast(dyn_cast(V)); if (!C) return nullptr; if (BlockAddress *BA = dyn_cast(C)) return mapBlockAddress(*BA); auto mapValueOrNull = [this](Value *V) { auto Mapped = mapValue(V); assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) && "Unexpected null mapping for constant operand without " "NullMapMissingGlobalValues flag"); return Mapped; }; // Otherwise, we have some other constant to remap. Start by checking to see // if all operands have an identity remapping. unsigned OpNo = 0, NumOperands = C->getNumOperands(); Value *Mapped = nullptr; for (; OpNo != NumOperands; ++OpNo) { Value *Op = C->getOperand(OpNo); Mapped = mapValueOrNull(Op); if (!Mapped) return nullptr; if (Mapped != Op) break; } // See if the type mapper wants to remap the type as well. Type *NewTy = C->getType(); if (TypeMapper) NewTy = TypeMapper->remapType(NewTy); // If the result type and all operands match up, then just insert an identity // mapping. if (OpNo == NumOperands && NewTy == C->getType()) return getVM()[V] = C; // Okay, we need to create a new constant. We've already processed some or // all of the operands, set them all up now. SmallVector Ops; Ops.reserve(NumOperands); for (unsigned j = 0; j != OpNo; ++j) Ops.push_back(cast(C->getOperand(j))); // If one of the operands mismatch, push it and the other mapped operands. if (OpNo != NumOperands) { Ops.push_back(cast(Mapped)); // Map the rest of the operands that aren't processed yet. for (++OpNo; OpNo != NumOperands; ++OpNo) { Mapped = mapValueOrNull(C->getOperand(OpNo)); if (!Mapped) return nullptr; Ops.push_back(cast(Mapped)); } } Type *NewSrcTy = nullptr; if (TypeMapper) if (auto *GEPO = dyn_cast(C)) NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType()); if (ConstantExpr *CE = dyn_cast(C)) return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy); if (isa(C)) return getVM()[V] = ConstantArray::get(cast(NewTy), Ops); if (isa(C)) return getVM()[V] = ConstantStruct::get(cast(NewTy), Ops); if (isa(C)) return getVM()[V] = ConstantVector::get(Ops); // If this is a no-operand constant, it must be because the type was remapped. if (isa(C)) return getVM()[V] = UndefValue::get(NewTy); if (isa(C)) return getVM()[V] = ConstantAggregateZero::get(NewTy); assert(isa(C)); return getVM()[V] = ConstantPointerNull::get(cast(NewTy)); } Value *Mapper::mapBlockAddress(const BlockAddress &BA) { Function *F = cast(mapValue(BA.getFunction())); // F may not have materialized its initializer. In that case, create a // dummy basic block for now, and replace it once we've materialized all // the initializers. BasicBlock *BB; if (F->empty()) { DelayedBBs.push_back(DelayedBasicBlock(BA)); BB = DelayedBBs.back().TempBB.get(); } else { BB = cast_or_null(mapValue(BA.getBasicBlock())); } return getVM()[&BA] = BlockAddress::get(F, BB ? 
                                               BB : BA.getBasicBlock());
}

Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
  getVM().MD()[Key].reset(Val);
  return Val;
}

Metadata *Mapper::mapToSelf(const Metadata *MD) {
  return mapToMetadata(MD, const_cast<Metadata *>(MD));
}

Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
  if (!Op)
    return nullptr;

  if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
#ifndef NDEBUG
    if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
      assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
              M.getVM().getMappedMD(Op)) &&
             "Expected Value to be memoized");
    else
      assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
             "Expected result to be memoized");
#endif
    return *MappedOp;
  }

  const MDNode &N = *cast<MDNode>(Op);
  if (N.isDistinct())
    return mapDistinctNode(N);
  return None;
}

+static Metadata *cloneOrBuildODR(const MDNode &N) {
+  auto *CT = dyn_cast<DICompositeType>(&N);
+  // If ODR type uniquing is enabled, we would have uniqued composite types
+  // with identifiers during bitcode reading, so we can just use CT.
+  if (CT && CT->getContext().isODRUniquingDebugTypes() &&
+      CT->getIdentifier() != "")
+    return const_cast<DICompositeType *>(CT);
+  return MDNode::replaceWithDistinct(N.clone());
+}
+
MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
  assert(N.isDistinct() && "Expected a distinct node");
  assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
-  DistinctWorklist.push_back(cast<MDNode>(
-      (M.Flags & RF_MoveDistinctMDs)
-          ? M.mapToSelf(&N)
-          : M.mapToMetadata(&N, MDNode::replaceWithDistinct(N.clone()))));
+  DistinctWorklist.push_back(
+      cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
+                       ? M.mapToSelf(&N)
+                       : M.mapToMetadata(&N, cloneOrBuildODR(N))));
  return DistinctWorklist.back();
}

static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
                                                  Value *MappedV) {
  if (CMD.getValue() == MappedV)
    return const_cast<ConstantAsMetadata *>(&CMD);
  return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
}

Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
  if (!Op)
    return nullptr;

  if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
    return *MappedOp;

  if (isa<MDString>(Op))
    return const_cast<Metadata *>(Op);

  if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
    return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));

  return None;
}

Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
  auto Where = Info.find(&Op);
  assert(Where != Info.end() && "Expected a valid reference");

  auto &OpD = Where->second;
  if (!OpD.HasChanged)
    return Op;

  // Lazily construct a temporary node.
  if (!OpD.Placeholder)
    OpD.Placeholder = Op.clone();

  return *OpD.Placeholder;
}

template <class OperandMapper>
void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
  assert(!N.isUniqued() && "Expected distinct or temporary nodes");
  for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
    Metadata *Old = N.getOperand(I);
    Metadata *New = mapOperand(Old);

    if (Old != New)
      N.replaceOperandWith(I, New);
  }
}

namespace {

/// An entry in the worklist for the post-order traversal.
struct POTWorklistEntry {
  MDNode *N;              ///< Current node.
  MDNode::op_iterator Op; ///< Current operand of \c N.

  /// Keep a flag of whether operands have changed in the worklist to avoid
  /// hitting the map in \a UniquedGraph.
  bool HasChanged = false;

  POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
};

} // end anonymous namespace

bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
  assert(G.Info.empty() && "Expected a fresh traversal");
  assert(FirstN.isUniqued() && "Expected uniqued node in POT");

  // Construct a post-order traversal of the uniqued subgraph under FirstN.
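An aside on the new cloneOrBuildODR() path above: it only fires when the destination LLVMContext has debug-type ODR uniquing switched on, as an LTO-style client would arrange before reading bitcode. A minimal sketch of that setup (function name illustrative):

    #include "llvm/IR/LLVMContext.h"
    #include <cassert>
    using namespace llvm;

    void enableODRTypeMerging(LLVMContext &Ctx) {
      // After this call, identified DICompositeTypes are uniqued per-context,
      // so cloneOrBuildODR() can return the existing node instead of cloning
      // a fresh distinct copy for every module being linked in.
      Ctx.enableDebugTypeODRUniquing();
      assert(Ctx.isODRUniquingDebugTypes() && "expected ODR uniquing on");
    }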
bool AnyChanges = false; SmallVector Worklist; Worklist.push_back(POTWorklistEntry(const_cast(FirstN))); (void)G.Info[&FirstN]; while (!Worklist.empty()) { // Start or continue the traversal through the this node's operands. auto &WE = Worklist.back(); if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) { // Push a new node to traverse first. Worklist.push_back(POTWorklistEntry(*N)); continue; } // Push the node onto the POT. assert(WE.N->isUniqued() && "Expected only uniqued nodes"); assert(WE.Op == WE.N->op_end() && "Expected to visit all operands"); auto &D = G.Info[WE.N]; AnyChanges |= D.HasChanged = WE.HasChanged; D.ID = G.POT.size(); G.POT.push_back(WE.N); // Pop the node off the worklist. Worklist.pop_back(); } return AnyChanges; } MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I, MDNode::op_iterator E, bool &HasChanged) { while (I != E) { Metadata *Op = *I++; // Increment even on early return. if (Optional MappedOp = tryToMapOperand(Op)) { // Check if the operand changes. HasChanged |= Op != *MappedOp; continue; } // A uniqued metadata node. MDNode &OpN = *cast(Op); assert(OpN.isUniqued() && "Only uniqued operands cannot be mapped immediately"); if (G.Info.insert(std::make_pair(&OpN, Data())).second) return &OpN; // This is a new one. Return it. } return nullptr; } void MDNodeMapper::UniquedGraph::propagateChanges() { bool AnyChanges; do { AnyChanges = false; for (MDNode *N : POT) { auto &D = Info[N]; if (D.HasChanged) continue; if (llvm::none_of(N->operands(), [&](const Metadata *Op) { auto Where = Info.find(Op); return Where != Info.end() && Where->second.HasChanged; })) continue; AnyChanges = D.HasChanged = true; } } while (AnyChanges); } void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) { // Construct uniqued nodes, building forward references as necessary. SmallVector CyclicNodes; for (auto *N : G.POT) { auto &D = G.Info[N]; if (!D.HasChanged) { // The node hasn't changed. M.mapToSelf(N); continue; } // Remember whether this node had a placeholder. bool HadPlaceholder(D.Placeholder); // Clone the uniqued node and remap the operands. TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone(); remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) { if (Optional MappedOp = getMappedOp(Old)) return *MappedOp; (void)D; assert(G.Info[Old].ID > D.ID && "Expected a forward reference"); return &G.getFwdReference(*cast(Old)); }); auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN)); M.mapToMetadata(N, NewN); // Nodes that were referenced out of order in the POT are involved in a // uniquing cycle. if (HadPlaceholder) CyclicNodes.push_back(NewN); } // Resolve cycles. for (auto *N : CyclicNodes) if (!N->isResolved()) N->resolveCycles(); } Metadata *MDNodeMapper::map(const MDNode &N) { assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive"); assert(!(M.Flags & RF_NoModuleLevelChanges) && "MDNodeMapper::map assumes module-level changes"); // Require resolved nodes whenever metadata might be remapped. assert(N.isResolved() && "Unexpected unresolved node"); Metadata *MappedN = N.isUniqued() ? 
mapTopLevelUniquedNode(N) : mapDistinctNode(N); while (!DistinctWorklist.empty()) remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) { if (Optional MappedOp = tryToMapOperand(Old)) return *MappedOp; return mapTopLevelUniquedNode(*cast(Old)); }); return MappedN; } Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) { assert(FirstN.isUniqued() && "Expected uniqued node"); // Create a post-order traversal of uniqued nodes under FirstN. UniquedGraph G; if (!createPOT(G, FirstN)) { // Return early if no nodes have changed. for (const MDNode *N : G.POT) M.mapToSelf(N); return &const_cast(FirstN); } // Update graph with all nodes that have changed. G.propagateChanges(); // Map all the nodes in the graph. mapNodesInPOT(G); // Return the original node, remapped. return *getMappedOp(&FirstN); } namespace { struct MapMetadataDisabler { ValueToValueMapTy &VM; MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) { VM.disableMapMetadata(); } ~MapMetadataDisabler() { VM.enableMapMetadata(); } }; } // end anonymous namespace Optional Mapper::mapSimpleMetadata(const Metadata *MD) { // If the value already exists in the map, use it. if (Optional NewMD = getVM().getMappedMD(MD)) return *NewMD; if (isa(MD)) return const_cast(MD); // This is a module-level metadata. If nothing at the module level is // changing, use an identity mapping. if ((Flags & RF_NoModuleLevelChanges)) return const_cast(MD); if (auto *CMD = dyn_cast(MD)) { // Disallow recursion into metadata mapping through mapValue. MapMetadataDisabler MMD(getVM()); // Don't memoize ConstantAsMetadata. Instead of lasting until the // LLVMContext is destroyed, they can be deleted when the GlobalValue they // reference is destructed. These aren't super common, so the extra // indirection isn't that expensive. return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue())); } assert(isa(MD) && "Expected a metadata node"); return None; } Metadata *Mapper::mapMetadata(const Metadata *MD) { assert(MD && "Expected valid metadata"); assert(!isa(MD) && "Unexpected local metadata"); if (Optional NewMD = mapSimpleMetadata(MD)) return *NewMD; return MDNodeMapper(*this).map(*cast(MD)); } void Mapper::flush() { // Flush out the worklist of global values. while (!Worklist.empty()) { WorklistEntry E = Worklist.pop_back_val(); CurrentMCID = E.MCID; switch (E.Kind) { case WorklistEntry::MapGlobalInit: E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init)); remapGlobalObjectMetadata(*E.Data.GVInit.GV); break; case WorklistEntry::MapAppendingVar: { unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers; mapAppendingVariable(*E.Data.AppendingGV.GV, E.Data.AppendingGV.InitPrefix, E.AppendingGVIsOldCtorDtor, makeArrayRef(AppendingInits).slice(PrefixSize)); AppendingInits.resize(PrefixSize); break; } case WorklistEntry::MapGlobalAliasee: E.Data.GlobalAliasee.GA->setAliasee( mapConstant(E.Data.GlobalAliasee.Aliasee)); break; case WorklistEntry::RemapFunction: remapFunction(*E.Data.RemapF); break; } } CurrentMCID = 0; // Finish logic for block addresses now that all global values have been // handled. while (!DelayedBBs.empty()) { DelayedBasicBlock DBB = DelayedBBs.pop_back_val(); BasicBlock *BB = cast_or_null(mapValue(DBB.OldBB)); DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB); } } void Mapper::remapInstruction(Instruction *I) { // Remap operands. for (Use &Op : I->operands()) { Value *V = mapValue(Op); // If we aren't ignoring missing entries, assert that something happened. 
if (V) Op = V; else assert((Flags & RF_IgnoreMissingLocals) && "Referenced value not in value map!"); } // Remap phi nodes' incoming blocks. if (PHINode *PN = dyn_cast(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *V = mapValue(PN->getIncomingBlock(i)); // If we aren't ignoring missing entries, assert that something happened. if (V) PN->setIncomingBlock(i, cast(V)); else assert((Flags & RF_IgnoreMissingLocals) && "Referenced block not in value map!"); } } // Remap attached metadata. SmallVector, 4> MDs; I->getAllMetadata(MDs); for (const auto &MI : MDs) { MDNode *Old = MI.second; MDNode *New = cast_or_null(mapMetadata(Old)); if (New != Old) I->setMetadata(MI.first, New); } if (!TypeMapper) return; // If the instruction's type is being remapped, do so now. if (auto CS = CallSite(I)) { SmallVector Tys; FunctionType *FTy = CS.getFunctionType(); Tys.reserve(FTy->getNumParams()); for (Type *Ty : FTy->params()) Tys.push_back(TypeMapper->remapType(Ty)); CS.mutateFunctionType(FunctionType::get( TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg())); return; } if (auto *AI = dyn_cast(I)) AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType())); if (auto *GEP = dyn_cast(I)) { GEP->setSourceElementType( TypeMapper->remapType(GEP->getSourceElementType())); GEP->setResultElementType( TypeMapper->remapType(GEP->getResultElementType())); } I->mutateType(TypeMapper->remapType(I->getType())); } void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) { SmallVector, 8> MDs; GO.getAllMetadata(MDs); GO.clearMetadata(); for (const auto &I : MDs) GO.addMetadata(I.first, *cast(mapMetadata(I.second))); } void Mapper::remapFunction(Function &F) { // Remap the operands. for (Use &Op : F.operands()) if (Op) Op = mapValue(Op); // Remap the metadata attachments. remapGlobalObjectMetadata(F); // Remap the argument types. if (TypeMapper) for (Argument &A : F.args()) A.mutateType(TypeMapper->remapType(A.getType())); // Remap the instructions. for (BasicBlock &BB : F) for (Instruction &I : BB) remapInstruction(&I); } void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef NewMembers) { SmallVector Elements; if (InitPrefix) { unsigned NumElements = cast(InitPrefix->getType())->getNumElements(); for (unsigned I = 0; I != NumElements; ++I) Elements.push_back(InitPrefix->getAggregateElement(I)); } PointerType *VoidPtrTy; Type *EltTy; if (IsOldCtorDtor) { // FIXME: This upgrade is done during linking to support the C API. See // also IRLinker::linkAppendingVarProto() in IRMover.cpp. 
VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo(); auto &ST = *cast(NewMembers.front()->getType()); Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy}; EltTy = StructType::get(GV.getContext(), Tys, false); } for (auto *V : NewMembers) { Constant *NewV; if (IsOldCtorDtor) { auto *S = cast(V); auto *E1 = cast(mapValue(S->getOperand(0))); auto *E2 = cast(mapValue(S->getOperand(1))); Constant *Null = Constant::getNullValue(VoidPtrTy); NewV = ConstantStruct::get(cast(EltTy), E1, E2, Null); } else { NewV = cast_or_null(mapValue(V)); } Elements.push_back(NewV); } GV.setInitializer(ConstantArray::get( cast(GV.getType()->getElementType()), Elements)); } void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init, unsigned MCID) { assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; WE.Kind = WorklistEntry::MapGlobalInit; WE.MCID = MCID; WE.Data.GVInit.GV = &GV; WE.Data.GVInit.Init = &Init; Worklist.push_back(WE); } void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef NewMembers, unsigned MCID) { assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; WE.Kind = WorklistEntry::MapAppendingVar; WE.MCID = MCID; WE.Data.AppendingGV.GV = &GV; WE.Data.AppendingGV.InitPrefix = InitPrefix; WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor; WE.AppendingGVNumNewMembers = NewMembers.size(); Worklist.push_back(WE); AppendingInits.append(NewMembers.begin(), NewMembers.end()); } void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, unsigned MCID) { assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; WE.Kind = WorklistEntry::MapGlobalAliasee; WE.MCID = MCID; WE.Data.GlobalAliasee.GA = &GA; WE.Data.GlobalAliasee.Aliasee = &Aliasee; Worklist.push_back(WE); } void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) { assert(AlreadyScheduled.insert(&F).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; WE.Kind = WorklistEntry::RemapFunction; WE.MCID = MCID; WE.Data.RemapF = &F; Worklist.push_back(WE); } void Mapper::addFlags(RemapFlags Flags) { assert(!hasWorkToDo() && "Expected to have flushed the worklist"); this->Flags = this->Flags | Flags; } static Mapper *getAsMapper(void *pImpl) { return reinterpret_cast(pImpl); } namespace { class FlushingMapper { Mapper &M; public: explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) { assert(!M.hasWorkToDo() && "Expected to be flushed"); } ~FlushingMapper() { M.flush(); } Mapper *operator->() const { return &M; } }; } // end anonymous namespace ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags, ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer) : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {} ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); } unsigned ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM, ValueMaterializer *Materializer) { return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer); } void ValueMapper::addFlags(RemapFlags Flags) { FlushingMapper(pImpl)->addFlags(Flags); } Value *ValueMapper::mapValue(const Value &V) { return FlushingMapper(pImpl)->mapValue(&V); } Constant *ValueMapper::mapConstant(const Constant &C) 
{
  return cast_or_null<Constant>(mapValue(C));
}

Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
  return FlushingMapper(pImpl)->mapMetadata(&MD);
}

MDNode *ValueMapper::mapMDNode(const MDNode &N) {
  return cast_or_null<MDNode>(mapMetadata(N));
}

void ValueMapper::remapInstruction(Instruction &I) {
  FlushingMapper(pImpl)->remapInstruction(&I);
}

void ValueMapper::remapFunction(Function &F) {
  FlushingMapper(pImpl)->remapFunction(F);
}

void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
                                               Constant &Init,
                                               unsigned MCID) {
  getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
}

void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
                                               Constant *InitPrefix,
                                               bool IsOldCtorDtor,
                                               ArrayRef<Constant *> NewMembers,
                                               unsigned MCID) {
  getAsMapper(pImpl)->scheduleMapAppendingVariable(
      GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
}

void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee,
                                           unsigned MCID) {
  getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID);
}

void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
  getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
}
Index: head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticDriverKinds.td	(revision 328752)
+++ head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticDriverKinds.td	(revision 328753)
@@ -1,357 +1,365 @@
//==--- DiagnosticDriverKinds.td - libdriver diagnostics ------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

let Component = "Driver" in {

def err_drv_no_such_file : Error<"no such file or directory: '%0'">;
def err_drv_unsupported_opt : Error<"unsupported option '%0'">;
def err_drv_unsupported_opt_for_target : Error<
  "unsupported option '%0' for target '%1'">;
def err_drv_unsupported_option_argument : Error<
  "unsupported argument '%1' to option '%0'">;
def err_drv_unknown_stdin_type : Error<
  "-E or -x required when input is from standard input">;
def err_drv_unknown_stdin_type_clang_cl : Error<
  "use /Tc or /Tp to set input type for standard input">;
def err_drv_unknown_language : Error<"language not recognized: '%0'">;
def err_drv_invalid_arch_name : Error<
  "invalid arch name '%0'">;
def err_drv_cuda_bad_gpu_arch : Error<"Unsupported CUDA gpu architecture: %0">;
def err_drv_no_cuda_installation : Error<
  "cannot find CUDA installation. Provide its path via --cuda-path, or pass "
  "-nocudainc to build without CUDA includes.">;
def err_drv_no_cuda_libdevice : Error<
  "cannot find libdevice for %0. Provide path to different CUDA installation "
  "via --cuda-path, or pass -nocudalib to build without linking with libdevice.">;
def err_drv_cuda_version_unsupported : Error<
  "GPU arch %0 is supported by CUDA versions between %1 and %2 (inclusive), "
  "but installation at %3 is %4.
Use --cuda-path to specify a different CUDA " "install, pass a different GPU arch with --cuda-gpu-arch, or pass " "--no-cuda-version-check.">; def err_drv_cuda_nvptx_host : Error<"unsupported use of NVPTX for host compilation.">; def err_drv_invalid_thread_model_for_target : Error< "invalid thread model '%0' in '%1' for this target">; def err_drv_invalid_linker_name : Error< "invalid linker name in argument '%0'">; def err_drv_invalid_pgo_instrumentor : Error< "invalid PGO instrumentor in argument '%0'">; def err_drv_invalid_rtlib_name : Error< "invalid runtime library name in argument '%0'">; def err_drv_unsupported_rtlib_for_platform : Error< "unsupported runtime library '%0' for platform '%1'">; def err_drv_invalid_stdlib_name : Error< "invalid library name in argument '%0'">; def err_drv_invalid_output_with_multiple_archs : Error< "cannot use '%0' output with multiple -arch options">; def err_drv_no_input_files : Error<"no input files">; def err_drv_use_of_Z_option : Error< "unsupported use of internal gcc -Z option '%0'">; def err_drv_output_argument_with_multiple_files : Error< "cannot specify -o when generating multiple output files">; def err_drv_out_file_argument_with_multiple_sources : Error< "cannot specify '%0%1' when compiling multiple source files">; def err_no_external_assembler : Error< "there is no external assembler that can be used on this platform">; def err_drv_unable_to_remove_file : Error< "unable to remove file: %0">; def err_drv_command_failure : Error< "unable to execute command: %0">; def err_drv_invalid_darwin_version : Error< "invalid Darwin version number: %0">; def err_drv_missing_argument : Error< "argument to '%0' is missing (expected %1 value%s1)">; def err_drv_invalid_Xarch_argument_with_args : Error< "invalid Xarch argument: '%0', options requiring arguments are unsupported">; def err_drv_invalid_Xarch_argument_isdriver : Error< "invalid Xarch argument: '%0', cannot change driver behavior inside Xarch argument">; def err_drv_Xopenmp_target_missing_triple : Error< "cannot deduce implicit triple value for -Xopenmp-target, specify triple using -Xopenmp-target=">; def err_drv_invalid_Xopenmp_target_with_args : Error< "invalid -Xopenmp-target argument: '%0', options requiring arguments are unsupported">; def err_drv_argument_only_allowed_with : Error< "invalid argument '%0' only allowed with '%1'">; def err_drv_argument_not_allowed_with : Error< "invalid argument '%0' not allowed with '%1'">; def err_drv_invalid_version_number : Error< "invalid version number in '%0'">; def err_drv_no_linker_llvm_support : Error< "'%0': unable to pass LLVM bit-code files to linker">; def err_drv_no_ast_support : Error< "'%0': unable to use AST files with this tool">; def err_drv_no_module_support : Error< "'%0': unable to use module files with this tool">; def err_drv_clang_unsupported : Error< "the clang compiler does not support '%0'">; def err_drv_clang_unsupported_opt_cxx_darwin_i386 : Error< "the clang compiler does not support '%0' for C++ on Darwin/i386">; def err_drv_clang_unsupported_opt_faltivec : Error< "the clang compiler does not support '%0', %1">; def err_drv_command_failed : Error< "%0 command failed with exit code %1 (use -v to see invocation)">; def err_drv_compilationdatabase : Error< "compilation database '%0' could not be opened: %1">; def err_drv_command_signalled : Error< "%0 command failed due to signal (use -v to see invocation)">; def err_drv_force_crash : Error< "failing because %select{environment variable 'FORCE_CLANG_DIAGNOSTICS_CRASH' is 
set|'-gen-reproducer' is used}0">; def err_drv_invalid_mfloat_abi : Error< "invalid float ABI '%0'">; def err_drv_invalid_mtp : Error< "invalid thread pointer reading mode '%0'">; def err_drv_missing_arg_mtp : Error< "missing argument to '%0'">; def err_drv_invalid_libcxx_deployment : Error< "invalid deployment target for -stdlib=libc++ (requires %0 or later)">; def err_drv_invalid_argument_to_fdebug_prefix_map : Error< "invalid argument '%0' to -fdebug-prefix-map">; def err_drv_malformed_sanitizer_blacklist : Error< "malformed sanitizer blacklist: '%0'">; def err_drv_duplicate_config : Error< "no more than one option '--config' is allowed">; def err_drv_config_file_not_exist : Error< "configuration file '%0' does not exist">; def err_drv_config_file_not_found : Error< "configuration file '%0' cannot be found">; def note_drv_config_file_searched_in : Note< "was searched for in the directory: %0">; def err_drv_cannot_read_config_file : Error< "cannot read configuration file '%0'">; def err_drv_nested_config_file: Error< "option '--config' is not allowed inside configuration file">; def err_target_unsupported_arch : Error<"the target architecture '%0' is not supported by the target '%1'">; def err_cpu_unsupported_isa : Error<"CPU '%0' does not support '%1' execution mode">; def err_arch_unsupported_isa : Error<"Architecture '%0' does not support '%1' execution mode">; def err_drv_I_dash_not_supported : Error< "'%0' not supported, please use -iquote instead">; def err_drv_unknown_argument : Error<"unknown argument: '%0'">; def warn_drv_unknown_argument_clang_cl : Warning< "unknown argument ignored in clang-cl: '%0'">, InGroup; def warn_drv_ycyu_no_arg_clang_cl : Warning< "support for '%0' without a filename not implemented yet; flag ignored">, InGroup; def warn_drv_ycyu_different_arg_clang_cl : Warning< "support for '/Yc' and '/Yu' with different filenames not implemented yet; flags ignored">, InGroup; def warn_drv_ycyu_no_fi_arg_clang_cl : Warning< "support for '%0' without a corresponding /FI flag not implemented yet; flag ignored">, InGroup; def warn_drv_yc_multiple_inputs_clang_cl : Warning< "support for '/Yc' with more than one source file not implemented yet; flag ignored">, InGroup; def err_drv_invalid_value : Error<"invalid value '%1' in '%0'">; def err_drv_invalid_int_value : Error<"invalid integral value '%1' in '%0'">; def err_drv_invalid_remap_file : Error< "invalid option '%0' not of the form ;">; def err_drv_invalid_gcc_output_type : Error< "invalid output type '%0' for use with gcc tool">; def err_drv_cc_print_options_failure : Error< "unable to open CC_PRINT_OPTIONS file: %0">; def err_drv_lto_without_lld : Error<"LTO requires -fuse-ld=lld">; def err_drv_preamble_format : Error< "incorrect format for -preamble-bytes=N,END">; def warn_invalid_ios_deployment_target : Warning< "invalid iOS deployment version '%0', iOS 10 is the maximum deployment " "target for 32-bit targets">, InGroup, DefaultError; def err_drv_conflicting_deployment_targets : Error< "conflicting deployment targets, both '%0' and '%1' are present in environment">; def err_arc_unsupported_on_runtime : Error< "-fobjc-arc is not supported on platforms using the legacy runtime">; def err_arc_unsupported_on_toolchain : Error< // feel free to generalize this "-fobjc-arc is not supported on versions of OS X prior to 10.6">; def err_objc_weak_with_gc : Error< "-fobjc-weak is not supported in Objective-C garbage collection">; def err_objc_weak_unsupported : Error< "-fobjc-weak is not supported on the current deployment 
target">; def err_drv_mg_requires_m_or_mm : Error< "option '-MG' requires '-M' or '-MM'">; def err_drv_unknown_objc_runtime : Error< "unknown or ill-formed Objective-C runtime '%0'">; def err_drv_emit_llvm_link : Error< "-emit-llvm cannot be used when linking">; def err_drv_optimization_remark_pattern : Error< "%0 in '%1'">; def err_drv_no_neon_modifier : Error<"[no]neon is not accepted as modifier, please use [no]simd instead">; def err_drv_invalid_omp_target : Error<"OpenMP target is invalid: '%0'">; def err_drv_omp_host_ir_file_not_found : Error< "The provided host compiler IR file '%0' is required to generate code for OpenMP target regions but cannot be found.">; def err_drv_omp_host_target_not_supported : Error< "The target '%0' is not a supported OpenMP host target.">; def err_drv_expecting_fopenmp_with_fopenmp_targets : Error< "The option -fopenmp-targets must be used in conjunction with a -fopenmp option compatible with offloading, please use -fopenmp=libomp or -fopenmp=libiomp5.">; def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; def warn_O4_is_O3 : Warning<"-O4 is equivalent to -O3">, InGroup; def warn_drv_optimization_value : Warning<"optimization level '%0' is not supported; using '%1%2' instead">, InGroup; def warn_ignored_gcc_optimization : Warning<"optimization flag '%0' is not supported">, InGroup; def warn_ignored_clang_option : Warning<"the flag '%0' has been deprecated and will be ignored">, InGroup; def warn_drv_unsupported_opt_for_target : Warning< "optimization flag '%0' is not supported for target '%1'">, InGroup; def warn_c_kext : Warning< "ignoring -fapple-kext which is valid for C++ and Objective-C++ only">; def warn_drv_input_file_unused : Warning< "%0: '%1' input unused%select{ when '%3' is present|}2">, InGroup; def warn_drv_input_file_unused_by_cpp : Warning< "%0: '%1' input unused in cpp mode">, InGroup; def warn_drv_preprocessed_input_file_unused : Warning< "%0: previously preprocessed input%select{ unused when '%2' is present|}1">, InGroup; def warn_drv_unused_argument : Warning< "argument unused during compilation: '%0'">, InGroup; def warn_drv_empty_joined_argument : Warning< "joined argument expects additional value: '%0'">, InGroup; def warn_drv_diagnostics_hotness_requires_pgo : Warning< "argument '%0' requires profile-guided optimization information">, InGroup; def warn_drv_clang_unsupported : Warning< "the clang compiler does not support '%0'">; def warn_drv_deprecated_arg : Warning< "argument '%0' is deprecated, use '%1' instead">, InGroup; def warn_drv_assuming_mfloat_abi_is : Warning< "unknown platform, assuming -mfloat-abi=%0">; def warn_ignoring_ftabstop_value : Warning< "ignoring invalid -ftabstop value '%0', using default value %1">; def warn_drv_overriding_flag_option : Warning< "overriding '%0' option with '%1'">, InGroup>; def warn_drv_treating_input_as_cxx : Warning< "treating '%0' input as '%1' when in C++ mode, this behavior is deprecated">, InGroup; def warn_drv_pch_not_first_include : Warning< "precompiled header '%0' was ignored because '%1' is not first '-include'">; def warn_missing_sysroot : Warning<"no such sysroot directory: '%0'">, InGroup>; def warn_incompatible_sysroot : Warning<"using sysroot for '%0' but targeting '%1'">, InGroup>; def warn_debug_compression_unavailable : Warning<"cannot 
compress debug sections (zlib not installed)">, InGroup>; def warn_drv_enabling_rtti_with_exceptions : Warning< "implicitly enabling rtti for exception handling">, InGroup>; def warn_drv_disabling_vptr_no_rtti_default : Warning< "implicitly disabling vptr sanitizer because rtti wasn't enabled">, InGroup; def warn_drv_object_size_disabled_O0 : Warning< "the object size sanitizer has no effect at -O0, but is explicitly enabled: %0">, InGroup; def note_drv_command_failed_diag_msg : Note< "diagnostic msg: %0">; def note_drv_t_option_is_global : Note< "The last /TC or /TP option takes precedence over earlier instances">; def note_drv_address_sanitizer_debug_runtime : Note< "AddressSanitizer doesn't support linking with debug runtime libraries yet">; def note_drv_use_standard : Note<"use '%0'" "%select{| or '%3'|, '%3', or '%4'|, '%3', '%4', or '%5'}2 " "for '%1' standard">; def err_analyzer_config_no_value : Error< "analyzer-config option '%0' has a key but no value">; def err_analyzer_config_multiple_values : Error< "analyzer-config option '%0' should contain only one '='">; def err_drv_invalid_hvx_length : Error< "-mhvx-length is not supported without a -mhvx/-mhvx= flag">; def err_drv_modules_validate_once_requires_timestamp : Error< "option '-fmodules-validate-once-per-build-session' requires " "'-fbuild-session-timestamp=' or '-fbuild-session-file='">; def err_test_module_file_extension_format : Error< "-ftest-module-file-extension argument '%0' is not of the required form " "'blockname:major:minor:hashed:user info'">; def warn_drv_invoking_fallback : Warning<"falling back to %0">, InGroup; def warn_slash_u_filename : Warning<"'/U%0' treated as the '/U' option">, InGroup>; def note_use_dashdash : Note<"Use '--' to treat subsequent arguments as filenames">; def err_drv_ropi_rwpi_incompatible_with_pic : Error< "embedded and GOT-based position independence are incompatible">; def err_drv_ropi_incompatible_with_cxx : Error< "ROPI is not compatible with c++">; def warn_target_unsupported_nan2008 : Warning< "ignoring '-mnan=2008' option because the '%0' architecture does not support it">, InGroup; def warn_target_unsupported_nanlegacy : Warning< "ignoring '-mnan=legacy' option because the '%0' architecture does not support it">, InGroup; def warn_target_unsupported_abslegacy : Warning< "ignoring '-mabs=legacy' option because the '%0' architecture does not support it">, InGroup; def warn_target_unsupported_abs2008 : Warning< "ignoring '-mabs=2008' option because the '%0' architecture does not support it">, InGroup; def warn_target_unsupported_compact_branches : Warning< "ignoring '-mcompact-branches=' option because the '%0' architecture does not" " support it">, InGroup; def warn_drv_unsupported_gpopt : Warning< "ignoring '-mgpopt' option as it cannot be used with %select{|the implicit" " usage of }0-mabicalls">, InGroup; def warn_drv_unsupported_longcalls : Warning< "ignoring '-mlong-calls' option as it is not currently supported with " "%select{|the implicit usage of }0-mabicalls">, InGroup; def warn_drv_unsupported_abicalls : Warning< "ignoring '-mabicalls' option as it cannot be used with " "non position-independent code and the N64 ABI">, InGroup; def warn_drv_unable_to_find_directory_expected : Warning< "unable to find %0 directory, expected to be in '%1'">, InGroup, DefaultIgnore; def warn_drv_ps4_force_pic : Warning< "option '%0' was ignored by the PS4 toolchain, using '-fPIC'">, InGroup; def warn_drv_ps4_sdk_dir : Warning< "environment variable SCE_ORBIS_SDK_DIR is set, but points to 
invalid or nonexistent directory '%0'">, InGroup; def err_drv_unsupported_linker : Error<"unsupported value '%0' for -linker option">; def err_drv_defsym_invalid_format : Error<"defsym must be of the form: sym=value: %0">; def err_drv_defsym_invalid_symval : Error<"Value is not an integer: %0">; def warn_drv_msvc_not_found : Warning< "unable to find a Visual Studio installation; " "try running Clang from a developer command prompt">, InGroup>; def warn_drv_fine_grained_bitfield_accesses_ignored : Warning< "option '-ffine-grained-bitfield-accesses' cannot be enabled together with a sanitizer; flag ignored">, InGroup; def note_drv_verify_prefix_spelling : Note< "-verify prefixes must start with a letter and contain only alphanumeric" " characters, hyphens, and underscores">; + +def warn_drv_experimental_isel_incomplete : Warning< + "-fexperimental-isel support for the '%0' architecture is incomplete">, + InGroup; + +def warn_drv_experimental_isel_incomplete_opt : Warning< + "-fexperimental-isel support is incomplete for this architecture at the current optimization level">, + InGroup; } Index: head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticGroups.td =================================================================== --- head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticGroups.td (revision 328752) +++ head/contrib/llvm/tools/clang/include/clang/Basic/DiagnosticGroups.td (revision 328753) @@ -1,987 +1,990 @@ //==--- DiagnosticGroups.td - Diagnostic Group Definitions ----------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// def ImplicitFunctionDeclare : DiagGroup<"implicit-function-declaration">; def ImplicitInt : DiagGroup<"implicit-int">; // Aggregation warning settings. def Implicit : DiagGroup<"implicit", [ ImplicitFunctionDeclare, ImplicitInt ]>; // Empty DiagGroups are recognized by clang but ignored. 
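The two warn_drv_experimental_isel_incomplete diagnostics added above are driver warnings, so they would be reported through the clang driver rather than Sema. A hedged sketch of an emission site (not part of this diff; the guard predicate is hypothetical):

    // In driver code, with D a clang::driver::Driver and Triple the effective
    // llvm::Triple, the new warning would be issued roughly like this:
    if (!hasCompleteGlobalISelSupport(Triple)) // hypothetical predicate
      D.Diag(diag::warn_drv_experimental_isel_incomplete)
          << Triple.getArchName();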
def : DiagGroup<"abi">; def AbsoluteValue : DiagGroup<"absolute-value">; def AddressOfTemporary : DiagGroup<"address-of-temporary">; def : DiagGroup<"aggregate-return">; def GNUAlignofExpression : DiagGroup<"gnu-alignof-expression">; def AmbigMemberTemplate : DiagGroup<"ambiguous-member-template">; def GNUAnonymousStruct : DiagGroup<"gnu-anonymous-struct">; def GNUAutoType : DiagGroup<"gnu-auto-type">; def ArrayBounds : DiagGroup<"array-bounds">; def ArrayBoundsPointerArithmetic : DiagGroup<"array-bounds-pointer-arithmetic">; def AutoDisableVptrSanitizer : DiagGroup<"auto-disable-vptr-sanitizer">; def Availability : DiagGroup<"availability">; def Section : DiagGroup<"section">; def AutoImport : DiagGroup<"auto-import">; def CXX14BinaryLiteral : DiagGroup<"c++14-binary-literal">; def CXXPre14CompatBinaryLiteral : DiagGroup<"c++98-c++11-compat-binary-literal">; def GNUBinaryLiteral : DiagGroup<"gnu-binary-literal">; def BinaryLiteral : DiagGroup<"binary-literal", [CXX14BinaryLiteral, CXXPre14CompatBinaryLiteral, GNUBinaryLiteral]>; def GNUCompoundLiteralInitializer : DiagGroup<"gnu-compound-literal-initializer">; def BitFieldConstantConversion : DiagGroup<"bitfield-constant-conversion">; def BitFieldEnumConversion : DiagGroup<"bitfield-enum-conversion">; def BitFieldWidth : DiagGroup<"bitfield-width">; def CoroutineMissingUnhandledException : DiagGroup<"coroutine-missing-unhandled-exception">; def Coroutine : DiagGroup<"coroutine", [CoroutineMissingUnhandledException]>; def ConstantConversion : DiagGroup<"constant-conversion", [ BitFieldConstantConversion ] >; def LiteralConversion : DiagGroup<"literal-conversion">; def StringConversion : DiagGroup<"string-conversion">; def SignConversion : DiagGroup<"sign-conversion">; def PointerBoolConversion : DiagGroup<"pointer-bool-conversion">; def UndefinedBoolConversion : DiagGroup<"undefined-bool-conversion">; def BoolConversion : DiagGroup<"bool-conversion", [PointerBoolConversion, UndefinedBoolConversion]>; def IntConversion : DiagGroup<"int-conversion">; def EnumConversion : DiagGroup<"enum-conversion">; def FloatOverflowConversion : DiagGroup<"float-overflow-conversion">; def FloatZeroConversion : DiagGroup<"float-zero-conversion">; def FloatConversion : DiagGroup<"float-conversion", [FloatOverflowConversion, FloatZeroConversion]>; def DoublePromotion : DiagGroup<"double-promotion">; def EnumTooLarge : DiagGroup<"enum-too-large">; def UnsupportedNan : DiagGroup<"unsupported-nan">; def UnsupportedAbs : DiagGroup<"unsupported-abs">; def UnsupportedCB : DiagGroup<"unsupported-cb">; def UnsupportedGPOpt : DiagGroup<"unsupported-gpopt">; def NonLiteralNullConversion : DiagGroup<"non-literal-null-conversion">; def NullConversion : DiagGroup<"null-conversion">; def ImplicitConversionFloatingPointToBool : DiagGroup<"implicit-conversion-floating-point-to-bool">; def ObjCLiteralConversion : DiagGroup<"objc-literal-conversion">; def MacroRedefined : DiagGroup<"macro-redefined">; def BuiltinMacroRedefined : DiagGroup<"builtin-macro-redefined">; def BuiltinRequiresHeader : DiagGroup<"builtin-requires-header">; def C99Compat : DiagGroup<"c99-compat">; def CXXCompat: DiagGroup<"c++-compat">; def ExternCCompat : DiagGroup<"extern-c-compat">; def KeywordCompat : DiagGroup<"keyword-compat">; def GNUCaseRange : DiagGroup<"gnu-case-range">; def CastAlign : DiagGroup<"cast-align">; def CastQual : DiagGroup<"cast-qual">; def : DiagGroup<"char-align">; def Comment : DiagGroup<"comment">; def GNUComplexInteger : DiagGroup<"gnu-complex-integer">; def 
GNUConditionalOmittedOperand : DiagGroup<"gnu-conditional-omitted-operand">; def ConfigMacros : DiagGroup<"config-macros">; def : DiagGroup<"ctor-dtor-privacy">; def GNUDesignator : DiagGroup<"gnu-designator">; def GNUStringLiteralOperatorTemplate : DiagGroup<"gnu-string-literal-operator-template">; def UndefinedVarTemplate : DiagGroup<"undefined-var-template">; def UndefinedFuncTemplate : DiagGroup<"undefined-func-template">; def MissingNoEscape : DiagGroup<"missing-noescape">; def DeleteIncomplete : DiagGroup<"delete-incomplete">; def DeleteNonVirtualDtor : DiagGroup<"delete-non-virtual-dtor">; def AbstractFinalClass : DiagGroup<"abstract-final-class">; def CXX11CompatDeprecatedWritableStr : DiagGroup<"c++11-compat-deprecated-writable-strings">; def DeprecatedAttributes : DiagGroup<"deprecated-attributes">; def DeprecatedDeclarations : DiagGroup<"deprecated-declarations">; def UnavailableDeclarations : DiagGroup<"unavailable-declarations">; def UnguardedAvailabilityNew : DiagGroup<"unguarded-availability-new">; def UnguardedAvailability : DiagGroup<"unguarded-availability", [UnguardedAvailabilityNew]>; // partial-availability is an alias of unguarded-availability. def : DiagGroup<"partial-availability", [UnguardedAvailability]>; def DeprecatedDynamicExceptionSpec : DiagGroup<"deprecated-dynamic-exception-spec">; def DeprecatedImplementations :DiagGroup<"deprecated-implementations">; def DeprecatedIncrementBool : DiagGroup<"deprecated-increment-bool">; def DeprecatedRegister : DiagGroup<"deprecated-register">; def DeprecatedWritableStr : DiagGroup<"deprecated-writable-strings", [CXX11CompatDeprecatedWritableStr]>; // FIXME: Why is DeprecatedImplementations not in this group? def Deprecated : DiagGroup<"deprecated", [DeprecatedAttributes, DeprecatedDeclarations, DeprecatedDynamicExceptionSpec, DeprecatedIncrementBool, DeprecatedRegister, DeprecatedWritableStr]>, DiagCategory<"Deprecations">; def DynamicExceptionSpec : DiagGroup<"dynamic-exception-spec", [DeprecatedDynamicExceptionSpec]>; def LibLTO : DiagGroup<"liblto">; def : DiagGroup<"disabled-optimization">; def : DiagGroup<"discard-qual">; def DivZero : DiagGroup<"division-by-zero">; def : DiagGroup<"div-by-zero", [DivZero]>; def DocumentationHTML : DiagGroup<"documentation-html">; def DocumentationUnknownCommand : DiagGroup<"documentation-unknown-command">; def DocumentationPedantic : DiagGroup<"documentation-pedantic", [DocumentationUnknownCommand]>; def DocumentationDeprecatedSync : DiagGroup<"documentation-deprecated-sync">; def Documentation : DiagGroup<"documentation", [DocumentationHTML, DocumentationDeprecatedSync]>; def EmptyBody : DiagGroup<"empty-body">; def Exceptions : DiagGroup<"exceptions">; def GNUEmptyInitializer : DiagGroup<"gnu-empty-initializer">; def GNUEmptyStruct : DiagGroup<"gnu-empty-struct">; def ExtraTokens : DiagGroup<"extra-tokens">; def CXX11ExtraSemi : DiagGroup<"c++11-extra-semi">; def ExtraSemi : DiagGroup<"extra-semi", [CXX11ExtraSemi]>; def GNUFlexibleArrayInitializer : DiagGroup<"gnu-flexible-array-initializer">; def GNUFlexibleArrayUnionMember : DiagGroup<"gnu-flexible-array-union-member">; def GNUFoldingConstant : DiagGroup<"gnu-folding-constant">; def FormatExtraArgs : DiagGroup<"format-extra-args">; def FormatZeroLength : DiagGroup<"format-zero-length">; def InvalidIOSDeploymentTarget : DiagGroup<"invalid-ios-deployment-target">; def CXX17CompatMangling : DiagGroup<"c++17-compat-mangling">; def : DiagGroup<"c++1z-compat-mangling", [CXX17CompatMangling]>; // Name of this warning in GCC. 
def NoexceptType : DiagGroup<"noexcept-type", [CXX17CompatMangling]>; // Warnings for C++1y code which is not compatible with prior C++ standards. def CXXPre14Compat : DiagGroup<"c++98-c++11-compat">; def CXXPre14CompatPedantic : DiagGroup<"c++98-c++11-compat-pedantic", [CXXPre14Compat, CXXPre14CompatBinaryLiteral]>; def CXXPre17Compat : DiagGroup<"c++98-c++11-c++14-compat">; def CXXPre17CompatPedantic : DiagGroup<"c++98-c++11-c++14-compat-pedantic", [CXXPre17Compat]>; def CXXPre2aCompat : DiagGroup<"c++98-c++11-c++14-c++17-compat">; def CXXPre2aCompatPedantic : DiagGroup<"c++98-c++11-c++14-c++17-compat-pedantic", [CXXPre2aCompat]>; def CXX98CompatBindToTemporaryCopy : DiagGroup<"c++98-compat-bind-to-temporary-copy">; def CXX98CompatLocalTypeTemplateArgs : DiagGroup<"c++98-compat-local-type-template-args">; def CXX98CompatUnnamedTypeTemplateArgs : DiagGroup<"c++98-compat-unnamed-type-template-args">; def CXX98Compat : DiagGroup<"c++98-compat", [CXX98CompatLocalTypeTemplateArgs, CXX98CompatUnnamedTypeTemplateArgs, CXXPre14Compat, CXXPre17Compat, CXXPre2aCompat]>; // Warnings for C++11 features which are Extensions in C++98 mode. def CXX98CompatPedantic : DiagGroup<"c++98-compat-pedantic", [CXX98Compat, CXX98CompatBindToTemporaryCopy, CXXPre14CompatPedantic, CXXPre17CompatPedantic, CXXPre2aCompatPedantic]>; def CXX11Narrowing : DiagGroup<"c++11-narrowing">; def CXX11WarnOverrideDestructor : DiagGroup<"inconsistent-missing-destructor-override">; def CXX11WarnOverrideMethod : DiagGroup<"inconsistent-missing-override">; // Original name of this warning in Clang def : DiagGroup<"c++0x-narrowing", [CXX11Narrowing]>; // Name of this warning in GCC def : DiagGroup<"narrowing", [CXX11Narrowing]>; def CXX11CompatReservedUserDefinedLiteral : DiagGroup<"c++11-compat-reserved-user-defined-literal">; def ReservedUserDefinedLiteral : DiagGroup<"reserved-user-defined-literal", [CXX11CompatReservedUserDefinedLiteral]>; def CXX11Compat : DiagGroup<"c++11-compat", [CXX11Narrowing, CXX11CompatReservedUserDefinedLiteral, CXX11CompatDeprecatedWritableStr, CXXPre14Compat, CXXPre17Compat, CXXPre2aCompat]>; def : DiagGroup<"c++0x-compat", [CXX11Compat]>; def CXX11CompatPedantic : DiagGroup<"c++11-compat-pedantic", [CXX11Compat, CXXPre14CompatPedantic, CXXPre17CompatPedantic, CXXPre2aCompatPedantic]>; def CXX14Compat : DiagGroup<"c++14-compat", [CXXPre17Compat, CXXPre2aCompat]>; def CXX14CompatPedantic : DiagGroup<"c++14-compat-pedantic", [CXX14Compat, CXXPre17CompatPedantic, CXXPre2aCompatPedantic]>; def CXX17Compat : DiagGroup<"c++17-compat", [DeprecatedRegister, DeprecatedIncrementBool, CXX17CompatMangling, CXXPre2aCompat]>; def CXX17CompatPedantic : DiagGroup<"c++17-compat-pedantic", [CXX17Compat, CXXPre2aCompatPedantic]>; def : DiagGroup<"c++1z-compat", [CXX17Compat]>; def CXX2aCompat : DiagGroup<"c++2a-compat">; def CXX2aCompatPedantic : DiagGroup<"c++2a-compat-pedantic", [CXX2aCompat]>; def ExitTimeDestructors : DiagGroup<"exit-time-destructors">; def FlexibleArrayExtensions : DiagGroup<"flexible-array-extensions">; def FourByteMultiChar : DiagGroup<"four-char-constants">; def GlobalConstructors : DiagGroup<"global-constructors">; def BitwiseOpParentheses: DiagGroup<"bitwise-op-parentheses">; def LogicalOpParentheses: DiagGroup<"logical-op-parentheses">; def LogicalNotParentheses: DiagGroup<"logical-not-parentheses">; def ShiftOpParentheses: DiagGroup<"shift-op-parentheses">; def OverloadedShiftOpParentheses: DiagGroup<"overloaded-shift-op-parentheses">; def DanglingElse: DiagGroup<"dangling-else">; def 
DanglingField : DiagGroup<"dangling-field">; def DistributedObjectModifiers : DiagGroup<"distributed-object-modifiers">; def ExpansionToDefined : DiagGroup<"expansion-to-defined">; def FlagEnum : DiagGroup<"flag-enum">; def IncrementBool : DiagGroup<"increment-bool", [DeprecatedIncrementBool]>; def InfiniteRecursion : DiagGroup<"infinite-recursion">; def GNUImaginaryConstant : DiagGroup<"gnu-imaginary-constant">; def IgnoredQualifiers : DiagGroup<"ignored-qualifiers">; def : DiagGroup<"import">; def GNUIncludeNext : DiagGroup<"gnu-include-next">; def IncompatibleMSStruct : DiagGroup<"incompatible-ms-struct">; def IncompatiblePointerTypesDiscardsQualifiers : DiagGroup<"incompatible-pointer-types-discards-qualifiers">; def IncompatibleFunctionPointerTypes : DiagGroup<"incompatible-function-pointer-types">; def IncompatiblePointerTypes : DiagGroup<"incompatible-pointer-types", [IncompatiblePointerTypesDiscardsQualifiers, IncompatibleFunctionPointerTypes]>; def IncompleteUmbrella : DiagGroup<"incomplete-umbrella">; def NonModularIncludeInFrameworkModule : DiagGroup<"non-modular-include-in-framework-module">; def NonModularIncludeInModule : DiagGroup<"non-modular-include-in-module", [NonModularIncludeInFrameworkModule]>; def IncompleteModule : DiagGroup<"incomplete-module", [IncompleteUmbrella, NonModularIncludeInModule]>; def PrivateModule : DiagGroup<"private-module">; def CXX11InlineNamespace : DiagGroup<"c++11-inline-namespace">; def InvalidNoreturn : DiagGroup<"invalid-noreturn">; def InvalidSourceEncoding : DiagGroup<"invalid-source-encoding">; def KNRPromotedParameter : DiagGroup<"knr-promoted-parameter">; def : DiagGroup<"init-self">; def : DiagGroup<"inline">; def : DiagGroup<"invalid-pch">; def GNULabelsAsValue : DiagGroup<"gnu-label-as-value">; def LiteralRange : DiagGroup<"literal-range">; def LocalTypeTemplateArgs : DiagGroup<"local-type-template-args", [CXX98CompatLocalTypeTemplateArgs]>; def RangeLoopAnalysis : DiagGroup<"range-loop-analysis">; def ForLoopAnalysis : DiagGroup<"for-loop-analysis">; def LoopAnalysis : DiagGroup<"loop-analysis", [ForLoopAnalysis, RangeLoopAnalysis]>; def MalformedWarningCheck : DiagGroup<"malformed-warning-check">; def Main : DiagGroup<"main">; def MainReturnType : DiagGroup<"main-return-type">; def MaxUnsignedZero : DiagGroup<"max-unsigned-zero">; def MissingBraces : DiagGroup<"missing-braces">; def MissingDeclarations: DiagGroup<"missing-declarations">; def : DiagGroup<"missing-format-attribute">; def : DiagGroup<"missing-include-dirs">; def MissingNoreturn : DiagGroup<"missing-noreturn">; def MultiChar : DiagGroup<"multichar">; def : DiagGroup<"nested-externs">; def CXX11LongLong : DiagGroup<"c++11-long-long">; def LongLong : DiagGroup<"long-long", [CXX11LongLong]>; def ImplicitlyUnsignedLiteral : DiagGroup<"implicitly-unsigned-literal">; def MethodSignatures : DiagGroup<"method-signatures">; def MismatchedParameterTypes : DiagGroup<"mismatched-parameter-types">; def MismatchedReturnTypes : DiagGroup<"mismatched-return-types">; def MismatchedTags : DiagGroup<"mismatched-tags">; def MissingFieldInitializers : DiagGroup<"missing-field-initializers">; def ModuleBuild : DiagGroup<"module-build">; def ModuleConflict : DiagGroup<"module-conflict">; def ModuleFileExtension : DiagGroup<"module-file-extension">; def NewlineEOF : DiagGroup<"newline-eof">; def Nullability : DiagGroup<"nullability">; def NullabilityDeclSpec : DiagGroup<"nullability-declspec">; def NullabilityInferredOnNestedType : DiagGroup<"nullability-inferred-on-nested-type">; def 
NullableToNonNullConversion : DiagGroup<"nullable-to-nonnull-conversion">; def NullabilityCompletenessOnArrays : DiagGroup<"nullability-completeness-on-arrays">; def NullabilityCompleteness : DiagGroup<"nullability-completeness", [NullabilityCompletenessOnArrays]>; def NullArithmetic : DiagGroup<"null-arithmetic">; def NullCharacter : DiagGroup<"null-character">; def NullDereference : DiagGroup<"null-dereference">; def InitializerOverrides : DiagGroup<"initializer-overrides">; def NonNull : DiagGroup<"nonnull">; def NonPODVarargs : DiagGroup<"non-pod-varargs">; def ClassVarargs : DiagGroup<"class-varargs", [NonPODVarargs]>; def : DiagGroup<"nonportable-cfstrings">; def NonVirtualDtor : DiagGroup<"non-virtual-dtor">; def NullPointerArithmetic : DiagGroup<"null-pointer-arithmetic">; def : DiagGroup<"effc++", [NonVirtualDtor]>; def OveralignedType : DiagGroup<"over-aligned">; def AlignedAllocationUnavailable : DiagGroup<"aligned-allocation-unavailable">; def OldStyleCast : DiagGroup<"old-style-cast">; def : DiagGroup<"old-style-definition">; def OutOfLineDeclaration : DiagGroup<"out-of-line-declaration">; def : DiagGroup<"overflow">; def ForwardClassReceiver : DiagGroup<"receiver-forward-class">; def MethodAccess : DiagGroup<"objc-method-access">; def ObjCReceiver : DiagGroup<"receiver-expr">; def OperatorNewReturnsNull : DiagGroup<"new-returns-null">; def OverlengthStrings : DiagGroup<"overlength-strings">; def OverloadedVirtual : DiagGroup<"overloaded-virtual">; def PrivateExtern : DiagGroup<"private-extern">; def SelTypeCast : DiagGroup<"cast-of-sel-type">; def FunctionDefInObjCContainer : DiagGroup<"function-def-in-objc-container">; def BadFunctionCast : DiagGroup<"bad-function-cast">; def ObjCPropertyImpl : DiagGroup<"objc-property-implementation">; def ObjCPropertyNoAttribute : DiagGroup<"objc-property-no-attribute">; def ObjCProtocolQualifiers : DiagGroup<"objc-protocol-qualifiers">; def ObjCMissingSuperCalls : DiagGroup<"objc-missing-super-calls">; def ObjCDesignatedInit : DiagGroup<"objc-designated-initializers">; def ObjCRetainBlockProperty : DiagGroup<"objc-noncopy-retain-block-property">; def ObjCReadonlyPropertyHasSetter : DiagGroup<"objc-readonly-with-setter-property">; def ObjCInvalidIBOutletProperty : DiagGroup<"invalid-iboutlet">; def ObjCRootClass : DiagGroup<"objc-root-class">; def ObjCPointerIntrospectPerformSelector : DiagGroup<"deprecated-objc-pointer-introspection-performSelector">; def ObjCPointerIntrospect : DiagGroup<"deprecated-objc-pointer-introspection", [ObjCPointerIntrospectPerformSelector]>; def ObjCMultipleMethodNames : DiagGroup<"objc-multiple-method-names">; def ObjCFlexibleArray : DiagGroup<"objc-flexible-array">; def OpenCLUnsupportedRGBA: DiagGroup<"opencl-unsupported-rgba">; def DeprecatedObjCIsaUsage : DiagGroup<"deprecated-objc-isa-usage">; def ExplicitInitializeCall : DiagGroup<"explicit-initialize-call">; def Packed : DiagGroup<"packed">; def Padded : DiagGroup<"padded">; def PessimizingMove : DiagGroup<"pessimizing-move">; def PointerArith : DiagGroup<"pointer-arith">; def PoundWarning : DiagGroup<"#warnings">; def PoundPragmaMessage : DiagGroup<"#pragma-messages">, DiagCategory<"#pragma message Directive">; def : DiagGroup<"pointer-to-int-cast">; def : DiagGroup<"redundant-decls">; def RedeclaredClassMember : DiagGroup<"redeclared-class-member">; def GNURedeclaredEnum : DiagGroup<"gnu-redeclared-enum">; def RedundantMove : DiagGroup<"redundant-move">; def Register : DiagGroup<"register", [DeprecatedRegister]>; def ReturnStackAddress : 
DiagGroup<"return-stack-address">; def ReturnTypeCLinkage : DiagGroup<"return-type-c-linkage">; def ReturnType : DiagGroup<"return-type", [ReturnTypeCLinkage]>; def BindToTemporaryCopy : DiagGroup<"bind-to-temporary-copy", [CXX98CompatBindToTemporaryCopy]>; def SelfAssignmentField : DiagGroup<"self-assign-field">; def SelfAssignment : DiagGroup<"self-assign", [SelfAssignmentField]>; def SelfMove : DiagGroup<"self-move">; def SemiBeforeMethodBody : DiagGroup<"semicolon-before-method-body">; def Sentinel : DiagGroup<"sentinel">; def MissingMethodReturnType : DiagGroup<"missing-method-return-type">; def ShadowField : DiagGroup<"shadow-field">; def ShadowFieldInConstructorModified : DiagGroup<"shadow-field-in-constructor-modified">; def ShadowFieldInConstructor : DiagGroup<"shadow-field-in-constructor", [ShadowFieldInConstructorModified]>; def ShadowIvar : DiagGroup<"shadow-ivar">; def ShadowUncapturedLocal : DiagGroup<"shadow-uncaptured-local">; // -Wshadow-all is a catch-all for all shadowing. -Wshadow is just the // shadowing that we think is unsafe. def Shadow : DiagGroup<"shadow", [ShadowFieldInConstructorModified, ShadowIvar]>; def ShadowAll : DiagGroup<"shadow-all", [Shadow, ShadowFieldInConstructor, ShadowUncapturedLocal, ShadowField]>; def Shorten64To32 : DiagGroup<"shorten-64-to-32">; def : DiagGroup<"sign-promo">; def SignCompare : DiagGroup<"sign-compare">; def : DiagGroup<"stack-protector">; def : DiagGroup<"switch-default">; def : DiagGroup<"synth">; def SizeofArrayArgument : DiagGroup<"sizeof-array-argument">; def SizeofArrayDecay : DiagGroup<"sizeof-array-decay">; def SizeofPointerMemaccess : DiagGroup<"sizeof-pointer-memaccess">; def StaticInInline : DiagGroup<"static-in-inline">; def StaticLocalInInline : DiagGroup<"static-local-in-inline">; def GNUStaticFloatInit : DiagGroup<"gnu-static-float-init">; def StaticFloatInit : DiagGroup<"static-float-init", [GNUStaticFloatInit]>; def GNUStatementExpression : DiagGroup<"gnu-statement-expression">; def StringCompare : DiagGroup<"string-compare">; def StringPlusInt : DiagGroup<"string-plus-int">; def StringPlusChar : DiagGroup<"string-plus-char">; def StrncatSize : DiagGroup<"strncat-size">; def TautologicalTypeLimitCompare : DiagGroup<"tautological-type-limit-compare">; def TautologicalUnsignedZeroCompare : DiagGroup<"tautological-unsigned-zero-compare">; def TautologicalUnsignedEnumZeroCompare : DiagGroup<"tautological-unsigned-enum-zero-compare">; def TautologicalInRangeCompare : DiagGroup<"tautological-constant-in-range-compare", [TautologicalTypeLimitCompare, TautologicalUnsignedZeroCompare, TautologicalUnsignedEnumZeroCompare]>; def TautologicalOutOfRangeCompare : DiagGroup<"tautological-constant-out-of-range-compare">; def TautologicalConstantCompare : DiagGroup<"tautological-constant-compare", [TautologicalOutOfRangeCompare]>; def TautologicalPointerCompare : DiagGroup<"tautological-pointer-compare">; def TautologicalOverlapCompare : DiagGroup<"tautological-overlap-compare">; def TautologicalUndefinedCompare : DiagGroup<"tautological-undefined-compare">; def TautologicalCompare : DiagGroup<"tautological-compare", [TautologicalConstantCompare, TautologicalPointerCompare, TautologicalOverlapCompare, TautologicalUndefinedCompare]>; def HeaderHygiene : DiagGroup<"header-hygiene">; def DuplicateDeclSpecifier : DiagGroup<"duplicate-decl-specifier">; def CompareDistinctPointerType : DiagGroup<"compare-distinct-pointer-types">; def GNUUnionCast : DiagGroup<"gnu-union-cast">; def GNUVariableSizedTypeNotAtEnd : 
DiagGroup<"gnu-variable-sized-type-not-at-end">; def Varargs : DiagGroup<"varargs">; def Unsequenced : DiagGroup<"unsequenced">; // GCC name for -Wunsequenced def : DiagGroup<"sequence-point", [Unsequenced]>; // Preprocessor warnings. def AmbiguousMacro : DiagGroup<"ambiguous-macro">; def KeywordAsMacro : DiagGroup<"keyword-macro">; def ReservedIdAsMacro : DiagGroup<"reserved-id-macro">; // Just silence warnings about -Wstrict-aliasing for now. def : DiagGroup<"strict-aliasing=0">; def : DiagGroup<"strict-aliasing=1">; def : DiagGroup<"strict-aliasing=2">; def : DiagGroup<"strict-aliasing">; // Just silence warnings about -Wstrict-overflow for now. def : DiagGroup<"strict-overflow=0">; def : DiagGroup<"strict-overflow=1">; def : DiagGroup<"strict-overflow=2">; def : DiagGroup<"strict-overflow=3">; def : DiagGroup<"strict-overflow=4">; def : DiagGroup<"strict-overflow=5">; def : DiagGroup<"strict-overflow">; def InvalidOffsetof : DiagGroup<"invalid-offsetof">; def : DiagGroup<"strict-prototypes">; def StrictSelector : DiagGroup<"strict-selector-match">; def MethodDuplicate : DiagGroup<"duplicate-method-match">; def ObjCCStringFormat : DiagGroup<"cstring-format-directive">; def CoveredSwitchDefault : DiagGroup<"covered-switch-default">; def SwitchBool : DiagGroup<"switch-bool">; def SwitchEnum : DiagGroup<"switch-enum">; def Switch : DiagGroup<"switch">; def EnumCompareSwitch : DiagGroup<"enum-compare-switch">; def EnumCompare : DiagGroup<"enum-compare", [EnumCompareSwitch]>; def ImplicitFallthroughPerFunction : DiagGroup<"implicit-fallthrough-per-function">; def ImplicitFallthrough : DiagGroup<"implicit-fallthrough", [ImplicitFallthroughPerFunction]>; def InvalidPPToken : DiagGroup<"invalid-pp-token">; def Trigraphs : DiagGroup<"trigraphs">; def : DiagGroup<"type-limits">; def UndefinedReinterpretCast : DiagGroup<"undefined-reinterpret-cast">; def ReinterpretBaseClass : DiagGroup<"reinterpret-base-class">; def Unicode : DiagGroup<"unicode">; def UninitializedMaybe : DiagGroup<"conditional-uninitialized">; def UninitializedSometimes : DiagGroup<"sometimes-uninitialized">; def UninitializedStaticSelfInit : DiagGroup<"static-self-init">; def Uninitialized : DiagGroup<"uninitialized", [UninitializedSometimes, UninitializedStaticSelfInit]>; def IgnoredPragmaIntrinsic : DiagGroup<"ignored-pragma-intrinsic">; def UnknownPragmas : DiagGroup<"unknown-pragmas">; def IgnoredPragmas : DiagGroup<"ignored-pragmas", [IgnoredPragmaIntrinsic]>; def PragmaClangAttribute : DiagGroup<"pragma-clang-attribute">; def PragmaPackSuspiciousInclude : DiagGroup<"pragma-pack-suspicious-include">; def PragmaPack : DiagGroup<"pragma-pack", [PragmaPackSuspiciousInclude]>; def Pragmas : DiagGroup<"pragmas", [UnknownPragmas, IgnoredPragmas, PragmaClangAttribute, PragmaPack]>; def UnknownWarningOption : DiagGroup<"unknown-warning-option">; def NSobjectAttribute : DiagGroup<"NSObject-attribute">; def NSConsumedMismatch : DiagGroup<"nsconsumed-mismatch">; def NSReturnsMismatch : DiagGroup<"nsreturns-mismatch">; def IndependentClassAttribute : DiagGroup<"IndependentClass-attribute">; def UnknownAttributes : DiagGroup<"unknown-attributes">; def IgnoredAttributes : DiagGroup<"ignored-attributes">; def Attributes : DiagGroup<"attributes", [UnknownAttributes, IgnoredAttributes]>; def UnknownSanitizers : DiagGroup<"unknown-sanitizers">; def UnnamedTypeTemplateArgs : DiagGroup<"unnamed-type-template-args", [CXX98CompatUnnamedTypeTemplateArgs]>; def UnsupportedFriend : DiagGroup<"unsupported-friend">; def UnusedArgument : 
DiagGroup<"unused-argument">; def UnusedCommandLineArgument : DiagGroup<"unused-command-line-argument">; def IgnoredOptimizationArgument : DiagGroup<"ignored-optimization-argument">; def InvalidCommandLineArgument : DiagGroup<"invalid-command-line-argument", [IgnoredOptimizationArgument]>; def UnusedComparison : DiagGroup<"unused-comparison">; def UnusedExceptionParameter : DiagGroup<"unused-exception-parameter">; def UnneededInternalDecl : DiagGroup<"unneeded-internal-declaration">; def UnneededMemberFunction : DiagGroup<"unneeded-member-function">; def UnusedPrivateField : DiagGroup<"unused-private-field">; def UnusedFunction : DiagGroup<"unused-function", [UnneededInternalDecl]>; def UnusedTemplate : DiagGroup<"unused-template", [UnneededInternalDecl]>; def UnusedMemberFunction : DiagGroup<"unused-member-function", [UnneededMemberFunction]>; def UnusedLabel : DiagGroup<"unused-label">; def UnusedLambdaCapture : DiagGroup<"unused-lambda-capture">; def UnusedParameter : DiagGroup<"unused-parameter">; def UnusedResult : DiagGroup<"unused-result">; def PotentiallyEvaluatedExpression : DiagGroup<"potentially-evaluated-expression">; def UnevaluatedExpression : DiagGroup<"unevaluated-expression", [PotentiallyEvaluatedExpression]>; def UnusedValue : DiagGroup<"unused-value", [UnusedComparison, UnusedResult, UnevaluatedExpression]>; def UnusedConstVariable : DiagGroup<"unused-const-variable">; def UnusedVariable : DiagGroup<"unused-variable", [UnusedConstVariable]>; def UnusedLocalTypedef : DiagGroup<"unused-local-typedef">; def UnusedPropertyIvar : DiagGroup<"unused-property-ivar">; def UnusedGetterReturnValue : DiagGroup<"unused-getter-return-value">; def UsedButMarkedUnused : DiagGroup<"used-but-marked-unused">; def UserDefinedLiterals : DiagGroup<"user-defined-literals">; def UserDefinedWarnings : DiagGroup<"user-defined-warnings">; def Reorder : DiagGroup<"reorder">; def UndeclaredSelector : DiagGroup<"undeclared-selector">; def ImplicitAtomic : DiagGroup<"implicit-atomic-properties">; def CustomAtomic : DiagGroup<"custom-atomic-properties">; def AtomicProperties : DiagGroup<"atomic-properties", [ImplicitAtomic, CustomAtomic]>; def ARCUnsafeRetainedAssign : DiagGroup<"arc-unsafe-retained-assign">; def ARCRetainCycles : DiagGroup<"arc-retain-cycles">; def ARCNonPodMemAccess : DiagGroup<"arc-non-pod-memaccess">; def AutomaticReferenceCounting : DiagGroup<"arc", [ARCUnsafeRetainedAssign, ARCRetainCycles, ARCNonPodMemAccess]>; def ARCRepeatedUseOfWeakMaybe : DiagGroup<"arc-maybe-repeated-use-of-weak">; def ARCRepeatedUseOfWeak : DiagGroup<"arc-repeated-use-of-weak", [ARCRepeatedUseOfWeakMaybe]>; def BlockCaptureAutoReleasing : DiagGroup<"block-capture-autoreleasing">; def ObjCBridge : DiagGroup<"bridge-cast">; def DeallocInCategory:DiagGroup<"dealloc-in-category">; def SelectorTypeMismatch : DiagGroup<"selector-type-mismatch">; def Selector : DiagGroup<"selector", [SelectorTypeMismatch]>; def Protocol : DiagGroup<"protocol">; def AtProtocol : DiagGroup<"at-protocol">; def PropertyAccessDotSyntax: DiagGroup<"property-access-dot-syntax">; def PropertyAttr : DiagGroup<"property-attribute-mismatch">; def SuperSubClassMismatch : DiagGroup<"super-class-method-mismatch">; def OverridingMethodMismatch : DiagGroup<"overriding-method-mismatch">; def VariadicMacros : DiagGroup<"variadic-macros">; def VectorConversion : DiagGroup<"vector-conversion">; // clang specific def VexingParse : DiagGroup<"vexing-parse">; def VLA : DiagGroup<"vla">; def VLAExtension : DiagGroup<"vla-extension">; def 
VolatileRegisterVar : DiagGroup<"volatile-register-var">;
def Visibility : DiagGroup<"visibility">;
def ZeroLengthArray : DiagGroup<"zero-length-array">;
def GNUZeroLineDirective : DiagGroup<"gnu-zero-line-directive">;
def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments">;
def Fallback : DiagGroup<"fallback">;

// This covers both the deprecated case (in C++98)
// and the extension case (in C++11 onwards).
def WritableStrings : DiagGroup<"writable-strings", [DeprecatedWritableStr]>;

// GCC calls -Wdeprecated-writable-strings -Wwrite-strings.
//
// Bizarrely, this warning flag enables -fconst-strings in C. This is
// GCC-compatible, but really weird.
//
// FIXME: Should this affect C++11 (where this is an error,
// not just deprecated) or not?
def GCCWriteStrings : DiagGroup<"write-strings", [WritableStrings]>;

def CharSubscript : DiagGroup<"char-subscripts">;
def LargeByValueCopy : DiagGroup<"large-by-value-copy">;
def DuplicateArgDecl : DiagGroup<"duplicate-method-arg">;
def SignedEnumBitfield : DiagGroup<"signed-enum-bitfield">;

// Unreachable code warning groups.
//
// The goal is to make -Wunreachable-code on by default, in -Wall, or at
// least actively used, with more noisy versions of the warning covered
// under separate flags.
//
def UnreachableCodeLoopIncrement : DiagGroup<"unreachable-code-loop-increment">;
def UnreachableCode : DiagGroup<"unreachable-code", [UnreachableCodeLoopIncrement]>;
def UnreachableCodeBreak : DiagGroup<"unreachable-code-break">;
def UnreachableCodeReturn : DiagGroup<"unreachable-code-return">;
def UnreachableCodeAggressive : DiagGroup<"unreachable-code-aggressive",
  [UnreachableCode, UnreachableCodeBreak, UnreachableCodeReturn]>;

// Aggregation warning settings.

// Populate -Waddress with warnings from other groups.
def : DiagGroup<"address", [PointerBoolConversion, StringCompare,
                            TautologicalPointerCompare]>;

// -Widiomatic-parentheses contains warnings about 'idiomatic'
// missing parentheses; it is off by default. We do not include it
// in -Wparentheses because most users who use -Wparentheses explicitly
// do not want these warnings.
def ParenthesesOnEquality : DiagGroup<"parentheses-equality">;
def Parentheses : DiagGroup<"parentheses",
  [LogicalOpParentheses, LogicalNotParentheses, BitwiseOpParentheses,
   ShiftOpParentheses, OverloadedShiftOpParentheses,
   ParenthesesOnEquality, DanglingElse]>;

// -Wconversion has its own warnings, but we split a few out for
// legacy reasons:
//   - some people want just 64-to-32 warnings
//   - conversion warnings with constant sources are on by default
//   - conversion warnings for literals are on by default
//   - bool-to-pointer conversion warnings are on by default
//   - __null-to-integer conversion warnings are on by default
def Conversion : DiagGroup<"conversion",
  [BoolConversion, ConstantConversion, EnumConversion,
   BitFieldEnumConversion, FloatConversion, Shorten64To32,
   IntConversion, LiteralConversion,
   NonLiteralNullConversion, // (1-1)->pointer (etc)
   NullConversion, // NULL->non-pointer
   ObjCLiteralConversion, SignConversion, StringConversion]>,
  DiagCategory<"Value Conversion Issue">;

def Unused : DiagGroup<"unused",
  [UnusedArgument, UnusedFunction, UnusedLabel,
   // UnusedParameter, (matches GCC's behavior)
   // UnusedTemplate, (clean-up libc++ before enabling)
   // UnusedMemberFunction, (clean-up llvm before enabling)
   UnusedPrivateField, UnusedLambdaCapture, UnusedLocalTypedef,
   UnusedValue, UnusedVariable, UnusedPropertyIvar]>,
  DiagCategory<"Unused Entity Issue">;

// Format settings.
def FormatInvalidSpecifier : DiagGroup<"format-invalid-specifier">; def FormatSecurity : DiagGroup<"format-security">; def FormatNonStandard : DiagGroup<"format-non-iso">; def FormatY2K : DiagGroup<"format-y2k">; def FormatPedantic : DiagGroup<"format-pedantic">; def Format : DiagGroup<"format", [FormatExtraArgs, FormatZeroLength, NonNull, FormatSecurity, FormatY2K, FormatInvalidSpecifier]>, DiagCategory<"Format String Issue">; def FormatNonLiteral : DiagGroup<"format-nonliteral">; def Format2 : DiagGroup<"format=2", [FormatNonLiteral, FormatSecurity, FormatY2K]>; def TypeSafety : DiagGroup<"type-safety">; def IncompatibleExceptionSpec : DiagGroup<"incompatible-exception-spec">; def IntToVoidPointerCast : DiagGroup<"int-to-void-pointer-cast">; def IntToPointerCast : DiagGroup<"int-to-pointer-cast", [IntToVoidPointerCast]>; def Move : DiagGroup<"move", [PessimizingMove, RedundantMove, SelfMove]>; def Extra : DiagGroup<"extra", [ MissingFieldInitializers, IgnoredQualifiers, InitializerOverrides, SemiBeforeMethodBody, MissingMethodReturnType, SignCompare, UnusedParameter, NullPointerArithmetic ]>; def Most : DiagGroup<"most", [ CharSubscript, Comment, DeleteNonVirtualDtor, ForLoopAnalysis, Format, Implicit, InfiniteRecursion, MismatchedTags, MissingBraces, Move, MultiChar, Reorder, ReturnType, SelfAssignment, SelfMove, SizeofArrayArgument, SizeofArrayDecay, StringPlusInt, Trigraphs, Uninitialized, UnknownPragmas, Unused, VolatileRegisterVar, ObjCMissingSuperCalls, ObjCDesignatedInit, ObjCFlexibleArray, OverloadedVirtual, PrivateExtern, SelTypeCast, ExternCCompat, UserDefinedWarnings ]>; // Thread Safety warnings def ThreadSafetyAttributes : DiagGroup<"thread-safety-attributes">; def ThreadSafetyAnalysis : DiagGroup<"thread-safety-analysis">; def ThreadSafetyPrecise : DiagGroup<"thread-safety-precise">; def ThreadSafetyReference : DiagGroup<"thread-safety-reference">; def ThreadSafetyNegative : DiagGroup<"thread-safety-negative">; def ThreadSafety : DiagGroup<"thread-safety", [ThreadSafetyAttributes, ThreadSafetyAnalysis, ThreadSafetyPrecise, ThreadSafetyReference]>; def ThreadSafetyVerbose : DiagGroup<"thread-safety-verbose">; def ThreadSafetyBeta : DiagGroup<"thread-safety-beta">; // Uniqueness Analysis warnings def Consumed : DiagGroup<"consumed">; // Note that putting warnings in -Wall will not disable them by default. If a // warning should be active _only_ when -Wall is passed in, mark it as // DefaultIgnore in addition to putting it here. def All : DiagGroup<"all", [Most, Parentheses, Switch, SwitchBool]>; // Warnings that should be in clang-cl /w4. def : DiagGroup<"CL4", [All, Extra]>; // Warnings enabled by -pedantic. This is magically filled in by TableGen. def Pedantic : DiagGroup<"pedantic">; // Aliases. 
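// A group alias is simply an anonymous group whose only member is the
// canonical group; enabling the alias spelling enables everything in the
// target. A minimal sketch of the idiom, using hypothetical -Wfoo and
// -Wold-foo names (not real warning flags):
//
//   def Foo : DiagGroup<"foo">;
//   def : DiagGroup<"old-foo", [Foo]>;  // -Wold-foo = -Wfoo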
def : DiagGroup<"", [Extra]>; // -W = -Wextra def : DiagGroup<"endif-labels", [ExtraTokens]>; // -Wendif-labels=-Wextra-tokens def : DiagGroup<"cpp", [PoundWarning]>; // -Wcpp = -W#warnings def : DiagGroup<"comments", [Comment]>; // -Wcomments = -Wcomment def : DiagGroup<"conversion-null", [NullConversion]>; // -Wconversion-null = -Wnull-conversion def : DiagGroup<"bool-conversions", [BoolConversion]>; // -Wbool-conversions = -Wbool-conversion def : DiagGroup<"int-conversions", [IntConversion]>; // -Wint-conversions = -Wint-conversion def : DiagGroup<"vector-conversions", [VectorConversion]>; // -Wvector-conversions = -Wvector-conversion def : DiagGroup<"unused-local-typedefs", [UnusedLocalTypedef]>; // -Wunused-local-typedefs = -Wunused-local-typedef // A warning group for warnings that we want to have on by default in clang, // but which aren't on by default in GCC. def NonGCC : DiagGroup<"non-gcc", [SignCompare, Conversion, LiteralRange]>; // A warning group for warnings about using C++11 features as extensions in // earlier C++ versions. def CXX11 : DiagGroup<"c++11-extensions", [CXX11ExtraSemi, CXX11InlineNamespace, CXX11LongLong]>; // A warning group for warnings about using C++14 features as extensions in // earlier C++ versions. def CXX14 : DiagGroup<"c++14-extensions", [CXX14BinaryLiteral]>; // A warning group for warnings about using C++17 features as extensions in // earlier C++ versions. def CXX17 : DiagGroup<"c++17-extensions">; // A warning group for warnings about using C++2a features as extensions in // earlier C++ versions. def CXX2a : DiagGroup<"c++2a-extensions">; def : DiagGroup<"c++0x-extensions", [CXX11]>; def : DiagGroup<"c++1y-extensions", [CXX14]>; def : DiagGroup<"c++1z-extensions", [CXX17]>; def DelegatingCtorCycles : DiagGroup<"delegating-ctor-cycles">; // A warning group for warnings about using C11 features as extensions. def C11 : DiagGroup<"c11-extensions">; // A warning group for warnings about using C99 features as extensions. def C99 : DiagGroup<"c99-extensions">; // A warning group for warnings about GCC extensions. def GNU : DiagGroup<"gnu", [GNUAlignofExpression, GNUAnonymousStruct, GNUAutoType, GNUBinaryLiteral, GNUCaseRange, GNUComplexInteger, GNUCompoundLiteralInitializer, GNUConditionalOmittedOperand, GNUDesignator, GNUEmptyInitializer, GNUEmptyStruct, VLAExtension, GNUFlexibleArrayInitializer, GNUFlexibleArrayUnionMember, GNUFoldingConstant, GNUImaginaryConstant, GNUIncludeNext, GNULabelsAsValue, RedeclaredClassMember, GNURedeclaredEnum, GNUStatementExpression, GNUStaticFloatInit, GNUStringLiteralOperatorTemplate, GNUUnionCast, GNUVariableSizedTypeNotAtEnd, ZeroLengthArray, GNUZeroLineDirective, GNUZeroVariadicMacroArguments]>; // A warning group for warnings about code that clang accepts but gcc doesn't. def GccCompat : DiagGroup<"gcc-compat">; // Warnings for Microsoft extensions. 
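// Each "microsoft-*" warning below gets its own subgroup, which is then also
// listed in the umbrella -Wmicrosoft group at the end of this section. A
// minimal sketch of that pattern, with a hypothetical subgroup name (not a
// real flag):
//
//   def MicrosoftFrobnicate : DiagGroup<"microsoft-frobnicate">;
//   def Microsoft : DiagGroup<"microsoft", [MicrosoftFrobnicate /*, ...*/]>;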
def MicrosoftCharize : DiagGroup<"microsoft-charize">; def MicrosoftInclude : DiagGroup<"microsoft-include">; def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">; def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">; def MicrosoftSealed : DiagGroup<"microsoft-sealed">; def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">; def MicrosoftExceptionSpec : DiagGroup<"microsoft-exception-spec">; def MicrosoftUsingDecl : DiagGroup<"microsoft-using-decl">; def MicrosoftMutableReference : DiagGroup<"microsoft-mutable-reference">; def MicrosoftPureDefinition : DiagGroup<"microsoft-pure-definition">; def MicrosoftUnionMemberReference : DiagGroup< "microsoft-union-member-reference">; def MicrosoftExplicitConstructorCall : DiagGroup< "microsoft-explicit-constructor-call">; def MicrosoftEnumValue : DiagGroup<"microsoft-enum-value">; def MicrosoftDefaultArgRedefinition : DiagGroup<"microsoft-default-arg-redefinition">; def MicrosoftTemplate : DiagGroup<"microsoft-template">; def MicrosoftInconsistentDllImport : DiagGroup<"inconsistent-dllimport">; def MicrosoftRedeclareStatic : DiagGroup<"microsoft-redeclare-static">; def MicrosoftEnumForwardReference : DiagGroup<"microsoft-enum-forward-reference">; def MicrosoftGoto : DiagGroup<"microsoft-goto">; def MicrosoftFlexibleArray : DiagGroup<"microsoft-flexible-array">; def MicrosoftExtraQualification : DiagGroup<"microsoft-extra-qualification">; def MicrosoftCast : DiagGroup<"microsoft-cast">; def MicrosoftConstInit : DiagGroup<"microsoft-const-init">; def MicrosoftVoidPseudoDtor : DiagGroup<"microsoft-void-pseudo-dtor">; def MicrosoftAnonTag : DiagGroup<"microsoft-anon-tag">; def MicrosoftCommentPaste : DiagGroup<"microsoft-comment-paste">; def MicrosoftEndOfFile : DiagGroup<"microsoft-end-of-file">; def MicrosoftInaccessibleBase : DiagGroup<"microsoft-inaccessible-base">; // Aliases. def : DiagGroup<"msvc-include", [MicrosoftInclude]>; // -Wmsvc-include = -Wmicrosoft-include // Warnings group for warnings about Microsoft extensions. def Microsoft : DiagGroup<"microsoft", [MicrosoftCharize, MicrosoftInclude, MicrosoftCppMacro, MicrosoftFixedEnum, MicrosoftSealed, MicrosoftUnqualifiedFriend, MicrosoftExceptionSpec, MicrosoftUsingDecl, MicrosoftMutableReference, MicrosoftPureDefinition, MicrosoftUnionMemberReference, MicrosoftExplicitConstructorCall, MicrosoftEnumValue, MicrosoftDefaultArgRedefinition, MicrosoftTemplate, MicrosoftRedeclareStatic, MicrosoftEnumForwardReference, MicrosoftGoto, MicrosoftFlexibleArray, MicrosoftExtraQualification, MicrosoftCast, MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag, MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftInconsistentDllImport]>; def ClangClPch : DiagGroup<"clang-cl-pch">; def ObjCNonUnifiedException : DiagGroup<"objc-nonunified-exceptions">; def ObjCProtocolMethodImpl : DiagGroup<"objc-protocol-method-implementation">; def ObjCNoPropertyAutoSynthesis : DiagGroup<"objc-property-synthesis">; // ObjC API warning groups. def ObjCRedundantLiteralUse : DiagGroup<"objc-redundant-literal-use">; def ObjCRedundantAPIUse : DiagGroup<"objc-redundant-api-use", [ ObjCRedundantLiteralUse ]>; def ObjCCocoaAPI : DiagGroup<"objc-cocoa-api", [ ObjCRedundantAPIUse ]>; def ObjCStringComparison : DiagGroup<"objc-string-compare">; def ObjCStringConcatenation : DiagGroup<"objc-string-concatenation">; def ObjCLiteralComparison : DiagGroup<"objc-literal-compare", [ ObjCStringComparison ]>; // Inline ASM warnings. 
def ASMOperandWidths : DiagGroup<"asm-operand-widths">;
def ASMIgnoredQualifier : DiagGroup<"asm-ignored-qualifier">;
def ASM : DiagGroup<"asm", [ ASMOperandWidths, ASMIgnoredQualifier ]>;

// OpenMP warnings.
def SourceUsesOpenMP : DiagGroup<"source-uses-openmp">;
def OpenMPClauses : DiagGroup<"openmp-clauses">;
def OpenMPLoopForm : DiagGroup<"openmp-loop-form">;
def OpenMPTarget : DiagGroup<"openmp-target">;

// Backend warnings.
def BackendInlineAsm : DiagGroup<"inline-asm">;
def BackendFrameLargerThanEQ : DiagGroup<"frame-larger-than=">;
def BackendPlugin : DiagGroup<"backend-plugin">;
def RemarkBackendPlugin : DiagGroup<"remark-backend-plugin">;
def BackendOptimizationRemark : DiagGroup<"pass">;
def BackendOptimizationRemarkMissed : DiagGroup<"pass-missed">;
def BackendOptimizationRemarkAnalysis : DiagGroup<"pass-analysis">;
def BackendOptimizationFailure : DiagGroup<"pass-failed">;

// Instrumentation based profiling warnings.
def ProfileInstrMissing : DiagGroup<"profile-instr-missing">;
def ProfileInstrOutOfDate : DiagGroup<"profile-instr-out-of-date">;
def ProfileInstrUnprofiled : DiagGroup<"profile-instr-unprofiled">;

// AddressSanitizer frontend instrumentation remarks.
def SanitizeAddressRemarks : DiagGroup<"sanitize-address">;

// Issues with serialized diagnostics.
def SerializedDiagnostics : DiagGroup<"serialized-diagnostics">;

// A warning group for warnings about code that clang accepts when
// compiling CUDA C/C++ but which is not compatible with the CUDA spec.
def CudaCompat : DiagGroup<"cuda-compat">;

// A warning group for things that will change semantics in the future.
def FutureCompat : DiagGroup<"future-compat">;

def InvalidOrNonExistentDirectory : DiagGroup<"invalid-or-nonexistent-directory">;

def OptionIgnored : DiagGroup<"option-ignored">;

def UnknownArgument : DiagGroup<"unknown-argument">;

// A warning group for warnings about code that clang accepts when
// compiling OpenCL C/C++ but which is not compatible with the SPIR spec.
def SpirCompat : DiagGroup<"spir-compat">;
+
+// Warning for the experimental-isel options.
+def ExperimentalISel : DiagGroup<"experimental-isel">;
Index: head/contrib/llvm/tools/clang/include/clang/Driver/Options.td
===================================================================
--- head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 328752)
+++ head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 328753)
@@ -1,2787 +1,2791 @@
//===--- Options.td - Options for clang -----------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the options accepted by clang.
//
//===----------------------------------------------------------------------===//

// Include the common option parsing interfaces.
include "llvm/Option/OptParser.td"

/////////
// Flags

// DriverOption - The option is a "driver" option, and should not be forwarded
// to other tools.
def DriverOption : OptionFlag;

// LinkerInput - The option is a linker input.
def LinkerInput : OptionFlag;
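// A minimal sketch of how these flags attach to an option record (the option
// name "example-flag" and its help text are hypothetical, not a real flag):
//
//   def example_flag : Flag<["-"], "example-flag">, Flags<[DriverOption]>,
//     HelpText<"Illustrative driver-only flag">;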
// NoArgumentUnused - Don't report argument unused warnings for this option; this
// is useful for options like -static or -dynamic which a user may always end up
// passing, even if the platform defaults to (or only supports) that option.
def NoArgumentUnused : OptionFlag;

// Unsupported - The option is unsupported, and the driver will reject command
// lines that use it.
def Unsupported : OptionFlag;

// Ignored - The option is unsupported, and the driver will silently ignore it.
def Ignored : OptionFlag;

// CoreOption - This is considered a "core" Clang option, available in both
// clang and clang-cl modes.
def CoreOption : OptionFlag;

// CLOption - This is a cl.exe compatibility option. Options with this flag
// are made available when the driver is running in CL compatibility mode.
def CLOption : OptionFlag;

// CC1Option - This option should be accepted by clang -cc1.
def CC1Option : OptionFlag;

// CC1AsOption - This option should be accepted by clang -cc1as.
def CC1AsOption : OptionFlag;

// NoDriverOption - This option should not be accepted by the driver.
def NoDriverOption : OptionFlag;

// A short name to show in documentation. The name will be interpreted as rST.
class DocName<string name> { string DocName = name; }

// A brief description to show in documentation, interpreted as rST.
class DocBrief<code descr> { code DocBrief = descr; }

// Indicates that this group should be flattened into its parent when generating
// documentation.
class DocFlatten { bit DocFlatten = 1; }

// Indicates that this warning is ignored, but accepted with a warning for
// GCC compatibility.
class IgnoredGCCCompat : Flags<[HelpHidden]> {}

/////////
// Groups

def Action_Group : OptionGroup<"">, DocName<"Actions">,
  DocBrief<[{The action to perform on the input.}]>;

// Meta-group for options which are only used for compilation,
// and not linking etc.
def CompileOnly_Group : OptionGroup<"">, DocName<"Compilation flags">, DocBrief<[{
Flags controlling the behavior of Clang during compilation. These flags have
no effect during actions that do not perform compilation.}]>;

def Preprocessor_Group : OptionGroup<"">, Group<CompileOnly_Group>,
  DocName<"Preprocessor flags">, DocBrief<[{
Flags controlling the behavior of the Clang preprocessor.}]>;

def IncludePath_Group : OptionGroup<"">, Group<Preprocessor_Group>,
  DocName<"Include path management">, DocBrief<[{
Flags controlling how ``#include``\s are resolved to files.}]>;

def I_Group : OptionGroup<"">, Group<IncludePath_Group>, DocFlatten;
def i_Group : OptionGroup<"">, Group<IncludePath_Group>, DocFlatten;
def clang_i_Group : OptionGroup<"">, Group<i_Group>, DocFlatten;

def M_Group : OptionGroup<"">, Group<Preprocessor_Group>,
  DocName<"Dependency file generation">, DocBrief<[{
Flags controlling generation of a dependency file for ``make``-like build
systems.}]>;

def d_Group : OptionGroup<"">, Group<Preprocessor_Group>,
  DocName<"Dumping preprocessor state">, DocBrief<[{
Flags allowing the state of the preprocessor to be dumped in various ways.}]>;

def Diag_Group : OptionGroup<"">, Group<CompileOnly_Group>,
  DocName<"Diagnostic flags">, DocBrief<[{
Flags controlling which warnings, errors, and remarks Clang will generate.
See the :doc:`full list of warning and remark flags `.}]>; def R_Group : OptionGroup<"">, Group, DocFlatten; def R_value_Group : OptionGroup<"">, Group, DocFlatten; def W_Group : OptionGroup<"">, Group, DocFlatten; def W_value_Group : OptionGroup<"">, Group, DocFlatten; def f_Group : OptionGroup<"">, Group, DocName<"Target-independent compilation options">; def f_clang_Group : OptionGroup<"">, Group, DocFlatten; def pedantic_Group : OptionGroup<"">, Group, DocFlatten; def opencl_Group : OptionGroup<"">, Group, DocName<"OpenCL flags">; def m_Group : OptionGroup<"">, Group, DocName<"Target-dependent compilation options">; // Feature groups - these take command line options that correspond directly to // target specific features and can be translated directly from command line // options. def m_aarch64_Features_Group : OptionGroup<"">, Group, DocName<"AARCH64">; def m_amdgpu_Features_Group : OptionGroup<"">, Group, DocName<"AMDGPU">; def m_arm_Features_Group : OptionGroup<"">, Group, DocName<"ARM">; def m_hexagon_Features_Group : OptionGroup<"">, Group, DocName<"Hexagon">; // The features added by this group will not be added to target features. // These are explicitly handled. def m_hexagon_Features_HVX_Group : OptionGroup<"">, Group, DocName<"Hexagon">; def m_ppc_Features_Group : OptionGroup<"">, Group, DocName<"PowerPC">; def m_wasm_Features_Group : OptionGroup<"">, Group, DocName<"WebAssembly">; def m_x86_Features_Group : OptionGroup<"">, Group, Flags<[CoreOption]>, DocName<"X86">; def m_libc_Group : OptionGroup<"">, Group, Flags<[HelpHidden]>; def O_Group : OptionGroup<"">, Group, DocName<"Optimization level">, DocBrief<[{ Flags controlling how much optimization should be performed.}]>; def DebugInfo_Group : OptionGroup<"">, Group, DocName<"Debug information generation">, DocBrief<[{ Flags controlling how much and what kind of debug information should be generated.}]>; def g_Group : OptionGroup<"">, Group, DocName<"Kind and level of debug information">; def gN_Group : OptionGroup<"">, Group, DocName<"Debug level">; def ggdbN_Group : OptionGroup<"">, Group, DocFlatten; def gTune_Group : OptionGroup<"">, Group, DocName<"Debugger to tune debug information for">; def g_flags_Group : OptionGroup<"">, Group, DocName<"Debug information flags">; def StaticAnalyzer_Group : OptionGroup<"">, DocName<"Static analyzer flags">, DocBrief<[{ Flags controlling the behavior of the Clang Static Analyzer.}]>; // gfortran options that we recognize in the driver and pass along when // invoking GCC to compile Fortran code. def gfortran_Group : OptionGroup<"">, DocName<"Fortran compilation flags">, DocBrief<[{ Flags that will be passed onto the ``gfortran`` compiler when Clang is given a Fortran input.}]>; def Link_Group : OptionGroup<"">, DocName<"Linker flags">, DocBrief<[{Flags that are passed on to the linker}]>; def T_Group : OptionGroup<"">, Group, DocFlatten; def u_Group : OptionGroup<"">, Group, DocFlatten; def reserved_lib_Group : OptionGroup<"">, Flags<[Unsupported]>; // Temporary groups for clang options which we know we don't support, // but don't want to verbosely warn the user about. def clang_ignored_f_Group : OptionGroup<"">, Group, Flags<[Ignored]>; def clang_ignored_m_Group : OptionGroup<"">, Group, Flags<[Ignored]>; // Group for clang options in the process of deprecation. // Please include the version that deprecated the flag as comment to allow // easier garbage collection. 
def clang_ignored_legacy_options_Group : OptionGroup<"">, Group,
  Flags<[Ignored]>;

// Retired with clang-5.0
def : Flag<["-"], "fslp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;

// Group that ignores all gcc optimizations that won't be implemented
def clang_ignored_gcc_optimization_f_Group : OptionGroup<"">, Group,
  Flags<[Ignored]>;

/////////
// Options

// The internal option ID must be a valid C++ identifier and results in a
// clang::driver::options::OPT_XX enum constant for XX.
//
// We want to unambiguously be able to refer to options from the driver source
// code; for this reason the option name is mangled into an ID. This mangling
// isn't guaranteed to have an inverse, but for practical purposes it does.
//
// The mangling scheme is to ignore the leading '-', and perform the following
// substitutions:
//   _ => __
//   - => _
//   / => _SLASH
//   # => _HASH
//   ? => _QUESTION
//   , => _COMMA
//   = => _EQ
//   C++ => CXX
//   . => _

// Developer Driver Options

def internal_Group : OptionGroup<"">, Flags<[HelpHidden]>;
def internal_driver_Group : OptionGroup<"">, Group<internal_Group>,
  HelpText<"DRIVER OPTIONS">;
def internal_debug_Group : OptionGroup<"">, Group<internal_Group>,
  HelpText<"DEBUG/DEVELOPMENT OPTIONS">;

class InternalDriverOpt : Group<internal_driver_Group>,
  Flags<[DriverOption, HelpHidden]>;
def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
  Flags<[CoreOption, DriverOption, HelpHidden]>,
  HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
  Flags<[CoreOption, DriverOption, HelpHidden]>,
  HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
  HelpText<"Name for native GCC compiler">, MetaVarName<"">;
def ccc_pch_is_pch : Flag<["-"], "ccc-pch-is-pch">, InternalDriverOpt,
  HelpText<"Use lazy PCH for precompiled headers">;
def ccc_pch_is_pth : Flag<["-"], "ccc-pch-is-pth">, InternalDriverOpt,
  HelpText<"Use pretokenized headers for precompiled headers">;

class InternalDebugOpt : Group<internal_debug_Group>,
  Flags<[DriverOption, HelpHidden, CoreOption]>;
def ccc_install_dir : Separate<["-"], "ccc-install-dir">, InternalDebugOpt,
  HelpText<"Simulate installation in the given directory">;
def ccc_print_phases : Flag<["-"], "ccc-print-phases">, InternalDebugOpt,
  HelpText<"Dump list of actions to perform">;
def ccc_print_bindings : Flag<["-"], "ccc-print-bindings">, InternalDebugOpt,
  HelpText<"Show bindings of tools to actions">;
def ccc_arcmt_check : Flag<["-"], "ccc-arcmt-check">, InternalDriverOpt,
  HelpText<"Check for ARC migration issues that need manual handling">;
def ccc_arcmt_modify : Flag<["-"], "ccc-arcmt-modify">, InternalDriverOpt,
  HelpText<"Apply modifications to files to conform to ARC">;
def ccc_arcmt_migrate : Separate<["-"], "ccc-arcmt-migrate">, InternalDriverOpt,
  HelpText<"Apply modifications and produce temporary files that conform to ARC">;
def arcmt_migrate_report_output : Separate<["-"], "arcmt-migrate-report-output">,
  HelpText<"Output path for the plist report">, Flags<[CC1Option]>;
def arcmt_migrate_emit_arc_errors : Flag<["-"], "arcmt-migrate-emit-errors">,
  HelpText<"Emit ARC errors even if the migrator can fix them">, Flags<[CC1Option]>;
def gen_reproducer: Flag<["-"], "gen-reproducer">, InternalDebugOpt,
  HelpText<"Auto-generates preprocessed source files and a reproduction script">;
def _migrate : Flag<["--"], "migrate">, Flags<[DriverOption]>,
  HelpText<"Run the migrator">;
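// For example, applying the mangling scheme above to two options defined in
// this file yields:
//   -ccc-gcc-name   =>  clang::driver::options::OPT_ccc_gcc_name
//   -Rpass-missed=  =>  clang::driver::options::OPT_Rpass_missed_EQ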
def ccc_objcmt_migrate : Separate<["-"], "ccc-objcmt-migrate">, InternalDriverOpt,
  HelpText<"Apply modifications and produce temporary files to migrate to "
           "modern ObjC syntax">;
def objcmt_migrate_literals : Flag<["-"], "objcmt-migrate-literals">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC literals">;
def objcmt_migrate_subscripting : Flag<["-"], "objcmt-migrate-subscripting">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC subscripting">;
def objcmt_migrate_property : Flag<["-"], "objcmt-migrate-property">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC property">;
def objcmt_migrate_all : Flag<["-"], "objcmt-migrate-all">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC">;
def objcmt_migrate_readonly_property : Flag<["-"], "objcmt-migrate-readonly-property">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC readonly property">;
def objcmt_migrate_readwrite_property : Flag<["-"], "objcmt-migrate-readwrite-property">, Flags<[CC1Option]>,
  HelpText<"Enable migration to modern ObjC readwrite property">;
def objcmt_migrate_property_dot_syntax : Flag<["-"], "objcmt-migrate-property-dot-syntax">, Flags<[CC1Option]>,
  HelpText<"Enable migration of setter/getter messages to property-dot syntax">;
def objcmt_migrate_annotation : Flag<["-"], "objcmt-migrate-annotation">, Flags<[CC1Option]>,
  HelpText<"Enable migration to property and method annotations">;
def objcmt_migrate_instancetype : Flag<["-"], "objcmt-migrate-instancetype">, Flags<[CC1Option]>,
  HelpText<"Enable migration to infer instancetype for method result type">;
def objcmt_migrate_nsmacros : Flag<["-"], "objcmt-migrate-ns-macros">, Flags<[CC1Option]>,
  HelpText<"Enable migration to NS_ENUM/NS_OPTIONS macros">;
def objcmt_migrate_protocol_conformance : Flag<["-"], "objcmt-migrate-protocol-conformance">, Flags<[CC1Option]>,
  HelpText<"Enable migration to add protocol conformance on classes">;
def objcmt_atomic_property : Flag<["-"], "objcmt-atomic-property">, Flags<[CC1Option]>,
  HelpText<"Enable migration to 'atomic' properties">;
def objcmt_returns_innerpointer_property : Flag<["-"], "objcmt-returns-innerpointer-property">, Flags<[CC1Option]>,
  HelpText<"Enable migration to annotate property with NS_RETURNS_INNER_POINTER">;
def objcmt_ns_nonatomic_iosonly: Flag<["-"], "objcmt-ns-nonatomic-iosonly">, Flags<[CC1Option]>,
  HelpText<"Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute">;
def objcmt_migrate_designated_init : Flag<["-"], "objcmt-migrate-designated-init">, Flags<[CC1Option]>,
  HelpText<"Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods">;

def objcmt_whitelist_dir_path: Joined<["-"], "objcmt-whitelist-dir-path=">, Flags<[CC1Option]>,
  HelpText<"Only modify files with a filename contained in the provided directory path">;
// The misspelt "white-list" [sic] alias is due for removal.
def : Joined<["-"], "objcmt-white-list-dir-path=">, Flags<[CC1Option]>,
  Alias<objcmt_whitelist_dir_path>;

// Make sure all other -ccc- options are rejected.
def ccc_ : Joined<["-"], "ccc-">, Group, Flags<[Unsupported]>; // Standard Options def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[DriverOption, CoreOption]>, HelpText<"Print (but do not run) the commands to run for this compilation">; def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>, Flags<[DriverOption, CoreOption]>; def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group; def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"">, HelpText<"Add to search path for binaries and object files used implicitly">; def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group, HelpText<"Include comments from within macros in preprocessed output">; def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group, HelpText<"Include comments in preprocessed output">; def D : JoinedOrSeparate<["-"], "D">, Group, Flags<[CC1Option]>, MetaVarName<"=">, HelpText<"Define to (or 1 if omitted)">; def E : Flag<["-"], "E">, Flags<[DriverOption,CC1Option]>, Group, HelpText<"Only run the preprocessor">; def F : JoinedOrSeparate<["-"], "F">, Flags<[RenderJoined,CC1Option]>, HelpText<"Add directory to framework include search path">; def G : JoinedOrSeparate<["-"], "G">, Flags<[DriverOption]>, Group, MetaVarName<"">, HelpText<"Put objects of at most bytes " "into small data section (MIPS / Hexagon)">; def G_EQ : Joined<["-"], "G=">, Flags<[DriverOption]>, Group, Alias; def H : Flag<["-"], "H">, Flags<[CC1Option]>, Group, HelpText<"Show header includes and nesting depth">; def I_ : Flag<["-"], "I-">, Group, HelpText<"Restrict all prior -I flags to double-quoted inclusion and " "remove current directory from include path">; def I : JoinedOrSeparate<["-"], "I">, Group, Flags<[CC1Option,CC1AsOption]>, MetaVarName<"">, HelpText<"Add directory to include search path">; def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group, MetaVarName<"">, HelpText<"Add directory to library search path">; def MD : Flag<["-"], "MD">, Group, HelpText<"Write a depfile containing user and system headers">; def MMD : Flag<["-"], "MMD">, Group, HelpText<"Write a depfile containing user headers">; def M : Flag<["-"], "M">, Group, HelpText<"Like -MD, but also implies -E and writes to stdout by default">; def MM : Flag<["-"], "MM">, Group, HelpText<"Like -MMD, but also implies -E and writes to stdout by default">; def MF : JoinedOrSeparate<["-"], "MF">, Group, HelpText<"Write depfile output from -MMD, -MD, -MM, or -M to ">, MetaVarName<"">; def MG : Flag<["-"], "MG">, Group, Flags<[CC1Option]>, HelpText<"Add missing headers to depfile">; def MJ : JoinedOrSeparate<["-"], "MJ">, Group, HelpText<"Write a compilation database entry per input">; def MP : Flag<["-"], "MP">, Group, Flags<[CC1Option]>, HelpText<"Create phony target for each dependency (other than main file)">; def MQ : JoinedOrSeparate<["-"], "MQ">, Group, Flags<[CC1Option]>, HelpText<"Specify name of main file output to quote in depfile">; def MT : JoinedOrSeparate<["-"], "MT">, Group, Flags<[CC1Option]>, HelpText<"Specify name of main file output in depfile">; def MV : Flag<["-"], "MV">, Group, Flags<[CC1Option]>, HelpText<"Use NMake/Jom format for the depfile">; def Mach : Flag<["-"], "Mach">, Group; def O0 : Flag<["-"], "O0">, Group, Flags<[CC1Option, HelpHidden]>; def O4 : Flag<["-"], "O4">, Group, Flags<[CC1Option, HelpHidden]>; def ObjCXX : Flag<["-"], "ObjC++">, Flags<[DriverOption]>, HelpText<"Treat source input files as Objective-C++ inputs">; def ObjC : Flag<["-"], "ObjC">, Flags<[DriverOption]>, HelpText<"Treat source input files as Objective-C inputs">; def O : 
Joined<["-"], "O">, Group, Flags<[CC1Option]>;
def O_flag : Flag<["-"], "O">, Flags<[CC1Option]>, Alias<O>, AliasArgs<["2"]>;
def Ofast : Joined<["-"], "Ofast">, Group, Flags<[CC1Option]>;
def P : Flag<["-"], "P">, Flags<[CC1Option]>, Group,
  HelpText<"Disable linemarker output in -E mode">;
def Qn : Flag<["-"], "Qn">, IgnoredGCCCompat;
def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Flags<[DriverOption, CoreOption]>,
  HelpText<"Don't emit warning for unused driver arguments">;
def Q : Flag<["-"], "Q">, IgnoredGCCCompat;
def Rpass_EQ : Joined<["-"], "Rpass=">, Group, Flags<[CC1Option]>,
  HelpText<"Report transformations performed by optimization passes whose "
           "name matches the given POSIX regular expression">;
def Rpass_missed_EQ : Joined<["-"], "Rpass-missed=">, Group, Flags<[CC1Option]>,
  HelpText<"Report missed transformations by optimization passes whose "
           "name matches the given POSIX regular expression">;
def Rpass_analysis_EQ : Joined<["-"], "Rpass-analysis=">, Group, Flags<[CC1Option]>,
  HelpText<"Report transformation analysis from optimization passes whose "
           "name matches the given POSIX regular expression">;
def R_Joined : Joined<["-"], "R">, Group, Flags<[CC1Option, CoreOption]>,
  MetaVarName<"">, HelpText<"Enable the specified remark">;
def S : Flag<["-"], "S">, Flags<[DriverOption,CC1Option]>, Group,
  HelpText<"Only run preprocess and compilation steps">;
def Tbss : JoinedOrSeparate<["-"], "Tbss">, Group, MetaVarName<"">,
  HelpText<"Set starting address of BSS to ">;
def Tdata : JoinedOrSeparate<["-"], "Tdata">, Group, MetaVarName<"">,
  HelpText<"Set starting address of DATA to ">;
def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group, MetaVarName<"">,
  HelpText<"Set starting address of TEXT to ">;
def T : JoinedOrSeparate<["-"], "T">, Group, MetaVarName<"