Index: vendor/llvm/dist-release_80/.gitignore =================================================================== --- vendor/llvm/dist-release_80/.gitignore (revision 344764) +++ vendor/llvm/dist-release_80/.gitignore (revision 344765) @@ -1,80 +1,82 @@ #==============================================================================# # This file specifies intentionally untracked files that git should ignore. # See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html # # This file is intentionally different from the output of `git svn show-ignore`, # as most of those are useless. #==============================================================================# #==============================================================================# # File extensions to be ignored anywhere in the tree. #==============================================================================# # Temp files created by most text editors. *~ # Merge files created by git. *.orig # Byte compiled python modules. *.pyc # vim swap files .*.sw? .sw? #OS X specific files. .DS_store # Nested build directory /build #==============================================================================# # Explicit files to ignore (only matches one). #==============================================================================# # Various tag programs /tags /TAGS /GPATH /GRTAGS /GSYMS /GTAGS .gitusers autom4te.cache cscope.files cscope.out autoconf/aclocal.m4 autoconf/autom4te.cache /compile_commands.json # Visual Studio built-in CMake configuration /CMakeSettings.json # CLion project configuration /.idea #==============================================================================# # Directories to ignore (do not add trailing '/'s, they skip symlinks). #==============================================================================# # External projects that are tracked independently. projects/* !projects/*.* !projects/Makefile runtimes/* !runtimes/*.* # Clang, which is tracked independently. tools/clang # LLDB, which is tracked independently. tools/lldb # lld, which is tracked independently. tools/lld # llgo, which is tracked independently. tools/llgo # Polly, which is tracked independently. tools/polly # avrlit, which is tracked independently. tools/avrlit # Sphinx build tree, if building in-source dir. docs/_build # VS2017 and VSCode config files. .vscode .vs +# clangd index +.clangd #==============================================================================# # Files created in tree by the Go bindings. #==============================================================================# bindings/go/llvm/llvm_config.go bindings/go/llvm/workdir Index: vendor/llvm/dist-release_80/bindings/go/llvm/ir.go =================================================================== --- vendor/llvm/dist-release_80/bindings/go/llvm/ir.go (revision 344764) +++ vendor/llvm/dist-release_80/bindings/go/llvm/ir.go (revision 344765) @@ -1,1947 +1,1947 @@ //===- ir.go - Bindings for ir --------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines bindings for the ir component. 
// //===----------------------------------------------------------------------===// package llvm /* #include "llvm-c/Core.h" #include "llvm-c/Comdat.h" #include "IRBindings.h" #include <stdlib.h> */ import "C" import "unsafe" import "errors" type ( // We use these weird structs here because *Ref types are pointers and // Go's spec says that a pointer cannot be used as a receiver base type. Context struct { C C.LLVMContextRef } Module struct { C C.LLVMModuleRef } Type struct { C C.LLVMTypeRef } Value struct { C C.LLVMValueRef } Comdat struct { C C.LLVMComdatRef } BasicBlock struct { C C.LLVMBasicBlockRef } Builder struct { C C.LLVMBuilderRef } ModuleProvider struct { C C.LLVMModuleProviderRef } MemoryBuffer struct { C C.LLVMMemoryBufferRef } PassManager struct { C C.LLVMPassManagerRef } Use struct { C C.LLVMUseRef } Metadata struct { C C.LLVMMetadataRef } Attribute struct { C C.LLVMAttributeRef } Opcode C.LLVMOpcode TypeKind C.LLVMTypeKind Linkage C.LLVMLinkage Visibility C.LLVMVisibility CallConv C.LLVMCallConv ComdatSelectionKind C.LLVMComdatSelectionKind IntPredicate C.LLVMIntPredicate FloatPredicate C.LLVMRealPredicate LandingPadClause C.LLVMLandingPadClauseTy InlineAsmDialect C.LLVMInlineAsmDialect ) func (c Context) IsNil() bool { return c.C == nil } func (c Module) IsNil() bool { return c.C == nil } func (c Type) IsNil() bool { return c.C == nil } func (c Value) IsNil() bool { return c.C == nil } func (c BasicBlock) IsNil() bool { return c.C == nil } func (c Builder) IsNil() bool { return c.C == nil } func (c ModuleProvider) IsNil() bool { return c.C == nil } func (c MemoryBuffer) IsNil() bool { return c.C == nil } func (c PassManager) IsNil() bool { return c.C == nil } func (c Use) IsNil() bool { return c.C == nil } func (c Attribute) IsNil() bool { return c.C == nil } // helpers func llvmTypeRefPtr(t *Type) *C.LLVMTypeRef { return (*C.LLVMTypeRef)(unsafe.Pointer(t)) } func llvmValueRefPtr(t *Value) *C.LLVMValueRef { return (*C.LLVMValueRef)(unsafe.Pointer(t)) } func llvmMetadataRefPtr(t *Metadata) *C.LLVMMetadataRef { return (*C.LLVMMetadataRef)(unsafe.Pointer(t)) } func llvmBasicBlockRefPtr(t *BasicBlock) *C.LLVMBasicBlockRef { return (*C.LLVMBasicBlockRef)(unsafe.Pointer(t)) } func boolToLLVMBool(b bool) C.LLVMBool { if b { return C.LLVMBool(1) } return C.LLVMBool(0) } func llvmValueRefs(values []Value) (*C.LLVMValueRef, C.unsigned) { var pt *C.LLVMValueRef ptlen := C.unsigned(len(values)) if ptlen > 0 { pt = llvmValueRefPtr(&values[0]) } return pt, ptlen } func llvmMetadataRefs(mds []Metadata) (*C.LLVMMetadataRef, C.unsigned) { var pt *C.LLVMMetadataRef ptlen := C.unsigned(len(mds)) if ptlen > 0 { pt = llvmMetadataRefPtr(&mds[0]) } return pt, ptlen } //------------------------------------------------------------------------- // llvm.Opcode //------------------------------------------------------------------------- const ( Ret Opcode = C.LLVMRet Br Opcode = C.LLVMBr Switch Opcode = C.LLVMSwitch IndirectBr Opcode = C.LLVMIndirectBr Invoke Opcode = C.LLVMInvoke Unreachable Opcode = C.LLVMUnreachable // Standard Binary Operators Add Opcode = C.LLVMAdd FAdd Opcode = C.LLVMFAdd Sub Opcode = C.LLVMSub FSub Opcode = C.LLVMFSub Mul Opcode = C.LLVMMul FMul Opcode = C.LLVMFMul UDiv Opcode = C.LLVMUDiv SDiv Opcode = C.LLVMSDiv FDiv Opcode = C.LLVMFDiv URem Opcode = C.LLVMURem SRem Opcode = C.LLVMSRem FRem Opcode = C.LLVMFRem // Logical Operators Shl Opcode = C.LLVMShl LShr Opcode = C.LLVMLShr AShr Opcode = C.LLVMAShr And Opcode = C.LLVMAnd Or Opcode = C.LLVMOr Xor Opcode = C.LLVMXor // Memory
Operators Alloca Opcode = C.LLVMAlloca Load Opcode = C.LLVMLoad Store Opcode = C.LLVMStore GetElementPtr Opcode = C.LLVMGetElementPtr // Cast Operators Trunc Opcode = C.LLVMTrunc ZExt Opcode = C.LLVMZExt SExt Opcode = C.LLVMSExt FPToUI Opcode = C.LLVMFPToUI FPToSI Opcode = C.LLVMFPToSI UIToFP Opcode = C.LLVMUIToFP SIToFP Opcode = C.LLVMSIToFP FPTrunc Opcode = C.LLVMFPTrunc FPExt Opcode = C.LLVMFPExt PtrToInt Opcode = C.LLVMPtrToInt IntToPtr Opcode = C.LLVMIntToPtr BitCast Opcode = C.LLVMBitCast // Other Operators ICmp Opcode = C.LLVMICmp FCmp Opcode = C.LLVMFCmp PHI Opcode = C.LLVMPHI Call Opcode = C.LLVMCall Select Opcode = C.LLVMSelect // UserOp1 // UserOp2 VAArg Opcode = C.LLVMVAArg ExtractElement Opcode = C.LLVMExtractElement InsertElement Opcode = C.LLVMInsertElement ShuffleVector Opcode = C.LLVMShuffleVector ExtractValue Opcode = C.LLVMExtractValue InsertValue Opcode = C.LLVMInsertValue ) //------------------------------------------------------------------------- // llvm.TypeKind //------------------------------------------------------------------------- const ( VoidTypeKind TypeKind = C.LLVMVoidTypeKind FloatTypeKind TypeKind = C.LLVMFloatTypeKind DoubleTypeKind TypeKind = C.LLVMDoubleTypeKind X86_FP80TypeKind TypeKind = C.LLVMX86_FP80TypeKind FP128TypeKind TypeKind = C.LLVMFP128TypeKind PPC_FP128TypeKind TypeKind = C.LLVMPPC_FP128TypeKind LabelTypeKind TypeKind = C.LLVMLabelTypeKind IntegerTypeKind TypeKind = C.LLVMIntegerTypeKind FunctionTypeKind TypeKind = C.LLVMFunctionTypeKind StructTypeKind TypeKind = C.LLVMStructTypeKind ArrayTypeKind TypeKind = C.LLVMArrayTypeKind PointerTypeKind TypeKind = C.LLVMPointerTypeKind VectorTypeKind TypeKind = C.LLVMVectorTypeKind MetadataTypeKind TypeKind = C.LLVMMetadataTypeKind TokenTypeKind TypeKind = C.LLVMTokenTypeKind ) //------------------------------------------------------------------------- // llvm.Linkage //------------------------------------------------------------------------- const ( ExternalLinkage Linkage = C.LLVMExternalLinkage AvailableExternallyLinkage Linkage = C.LLVMAvailableExternallyLinkage LinkOnceAnyLinkage Linkage = C.LLVMLinkOnceAnyLinkage LinkOnceODRLinkage Linkage = C.LLVMLinkOnceODRLinkage WeakAnyLinkage Linkage = C.LLVMWeakAnyLinkage WeakODRLinkage Linkage = C.LLVMWeakODRLinkage AppendingLinkage Linkage = C.LLVMAppendingLinkage InternalLinkage Linkage = C.LLVMInternalLinkage PrivateLinkage Linkage = C.LLVMPrivateLinkage ExternalWeakLinkage Linkage = C.LLVMExternalWeakLinkage CommonLinkage Linkage = C.LLVMCommonLinkage ) //------------------------------------------------------------------------- // llvm.Visibility //------------------------------------------------------------------------- const ( DefaultVisibility Visibility = C.LLVMDefaultVisibility HiddenVisibility Visibility = C.LLVMHiddenVisibility ProtectedVisibility Visibility = C.LLVMProtectedVisibility ) //------------------------------------------------------------------------- // llvm.CallConv //------------------------------------------------------------------------- const ( CCallConv CallConv = C.LLVMCCallConv FastCallConv CallConv = C.LLVMFastCallConv ColdCallConv CallConv = C.LLVMColdCallConv X86StdcallCallConv CallConv = C.LLVMX86StdcallCallConv X86FastcallCallConv CallConv = C.LLVMX86FastcallCallConv ) //------------------------------------------------------------------------- // llvm.ComdatSelectionKind //------------------------------------------------------------------------- const ( AnyComdatSelectionKind ComdatSelectionKind = 
C.LLVMAnyComdatSelectionKind ExactMatchComdatSelectionKind ComdatSelectionKind = C.LLVMExactMatchComdatSelectionKind LargestComdatSelectionKind ComdatSelectionKind = C.LLVMLargestComdatSelectionKind NoDuplicatesComdatSelectionKind ComdatSelectionKind = C.LLVMNoDuplicatesComdatSelectionKind SameSizeComdatSelectionKind ComdatSelectionKind = C.LLVMSameSizeComdatSelectionKind ) //------------------------------------------------------------------------- // llvm.IntPredicate //------------------------------------------------------------------------- const ( IntEQ IntPredicate = C.LLVMIntEQ IntNE IntPredicate = C.LLVMIntNE IntUGT IntPredicate = C.LLVMIntUGT IntUGE IntPredicate = C.LLVMIntUGE IntULT IntPredicate = C.LLVMIntULT IntULE IntPredicate = C.LLVMIntULE IntSGT IntPredicate = C.LLVMIntSGT IntSGE IntPredicate = C.LLVMIntSGE IntSLT IntPredicate = C.LLVMIntSLT IntSLE IntPredicate = C.LLVMIntSLE ) //------------------------------------------------------------------------- // llvm.FloatPredicate //------------------------------------------------------------------------- const ( FloatPredicateFalse FloatPredicate = C.LLVMRealPredicateFalse FloatOEQ FloatPredicate = C.LLVMRealOEQ FloatOGT FloatPredicate = C.LLVMRealOGT FloatOGE FloatPredicate = C.LLVMRealOGE FloatOLT FloatPredicate = C.LLVMRealOLT FloatOLE FloatPredicate = C.LLVMRealOLE FloatONE FloatPredicate = C.LLVMRealONE FloatORD FloatPredicate = C.LLVMRealORD FloatUNO FloatPredicate = C.LLVMRealUNO FloatUEQ FloatPredicate = C.LLVMRealUEQ FloatUGT FloatPredicate = C.LLVMRealUGT FloatUGE FloatPredicate = C.LLVMRealUGE FloatULT FloatPredicate = C.LLVMRealULT FloatULE FloatPredicate = C.LLVMRealULE FloatUNE FloatPredicate = C.LLVMRealUNE FloatPredicateTrue FloatPredicate = C.LLVMRealPredicateTrue ) //------------------------------------------------------------------------- // llvm.LandingPadClause //------------------------------------------------------------------------- const ( LandingPadCatch LandingPadClause = C.LLVMLandingPadCatch LandingPadFilter LandingPadClause = C.LLVMLandingPadFilter ) //------------------------------------------------------------------------- // llvm.InlineAsmDialect //------------------------------------------------------------------------- const ( InlineAsmDialectATT InlineAsmDialect = C.LLVMInlineAsmDialectATT InlineAsmDialectIntel InlineAsmDialect = C.LLVMInlineAsmDialectIntel ) //------------------------------------------------------------------------- // llvm.Context //------------------------------------------------------------------------- func NewContext() Context { return Context{C.LLVMContextCreate()} } func GlobalContext() Context { return Context{C.LLVMGetGlobalContext()} } func (c Context) Dispose() { C.LLVMContextDispose(c.C) } func (c Context) MDKindID(name string) (id int) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) id = int(C.LLVMGetMDKindIDInContext(c.C, cname, C.unsigned(len(name)))) return } func MDKindID(name string) (id int) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) id = int(C.LLVMGetMDKindID(cname, C.unsigned(len(name)))) return } //------------------------------------------------------------------------- // llvm.Attribute //------------------------------------------------------------------------- func AttributeKindID(name string) (id uint) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) id = uint(C.LLVMGetEnumAttributeKindForName(cname, C.size_t(len(name)))) return } func (c Context) CreateEnumAttribute(kind uint, val uint64) (a Attribute) 
{ a.C = C.LLVMCreateEnumAttribute(c.C, C.unsigned(kind), C.uint64_t(val)) return } func (a Attribute) GetEnumKind() (id int) { id = int(C.LLVMGetEnumAttributeKind(a.C)) return } func (a Attribute) GetEnumValue() (val uint64) { val = uint64(C.LLVMGetEnumAttributeValue(a.C)) return } func (c Context) CreateStringAttribute(kind string, val string) (a Attribute) { ckind := C.CString(kind) defer C.free(unsafe.Pointer(ckind)) cval := C.CString(val) defer C.free(unsafe.Pointer(cval)) a.C = C.LLVMCreateStringAttribute(c.C, ckind, C.unsigned(len(kind)), cval, C.unsigned(len(val))) return } func (a Attribute) GetStringKind() string { length := C.unsigned(0) ckind := C.LLVMGetStringAttributeKind(a.C, &length) return C.GoStringN(ckind, C.int(length)) } func (a Attribute) GetStringValue() string { length := C.unsigned(0) ckind := C.LLVMGetStringAttributeValue(a.C, &length) return C.GoStringN(ckind, C.int(length)) } func (a Attribute) IsEnum() bool { return C.LLVMIsEnumAttribute(a.C) != 0; } func (a Attribute) IsString() bool { return C.LLVMIsStringAttribute(a.C) != 0; } //------------------------------------------------------------------------- // llvm.Module //------------------------------------------------------------------------- // Create and destroy modules. // See llvm::Module::Module. func NewModule(name string) (m Module) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) m.C = C.LLVMModuleCreateWithName(cname) return } func (c Context) NewModule(name string) (m Module) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) m.C = C.LLVMModuleCreateWithNameInContext(cname, c.C) return } // See llvm::Module::~Module func (m Module) Dispose() { C.LLVMDisposeModule(m.C) } // Data layout. See Module::getDataLayout. func (m Module) DataLayout() string { clayout := C.LLVMGetDataLayout(m.C) return C.GoString(clayout) } func (m Module) SetDataLayout(layout string) { clayout := C.CString(layout) defer C.free(unsafe.Pointer(clayout)) C.LLVMSetDataLayout(m.C, clayout) } // Target triple. See Module::getTargetTriple. func (m Module) Target() string { ctarget := C.LLVMGetTarget(m.C) return C.GoString(ctarget) } func (m Module) SetTarget(target string) { ctarget := C.CString(target) defer C.free(unsafe.Pointer(ctarget)) C.LLVMSetTarget(m.C, ctarget) } func (m Module) GetTypeByName(name string) (t Type) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) t.C = C.LLVMGetTypeByName(m.C, cname) return } // See Module::dump. func (m Module) Dump() { C.LLVMDumpModule(m.C) } func (m Module) String() string { cir := C.LLVMPrintModuleToString(m.C) defer C.free(unsafe.Pointer(cir)) ir := C.GoString(cir) return ir } // See Module::setModuleInlineAsm. func (m Module) SetInlineAsm(asm string) { casm := C.CString(asm) defer C.free(unsafe.Pointer(casm)) C.LLVMSetModuleInlineAsm(m.C, casm) } func (m Module) AddNamedMetadataOperand(name string, operand Metadata) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) C.LLVMAddNamedMetadataOperand2(m.C, cname, operand.C) } func (m Module) Context() (c Context) { c.C = C.LLVMGetModuleContext(m.C) return } //------------------------------------------------------------------------- // llvm.Type //------------------------------------------------------------------------- // LLVM types conform to the following hierarchy: // // types: // integer type // real type // function type // sequence types: // array type // pointer type // vector type // void type // label type // opaque type // See llvm::LLVMTypeKind::getTypeID. 
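// Illustrative sketch, not part of the upstream file: given the type hierarchy
// described above, TypeKind combines with the sequence-type accessors defined
// later in this file (ElementType, ReturnType) to walk a type. The helper name
// describeType is hypothetical.
func describeType(t Type) string {
	switch t.TypeKind() {
	case IntegerTypeKind:
		return "integer"
	case PointerTypeKind:
		return "pointer to " + describeType(t.ElementType())
	case StructTypeKind:
		return "struct"
	case FunctionTypeKind:
		return "function returning " + describeType(t.ReturnType())
	default:
		return "other"
	}
}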
func (t Type) TypeKind() TypeKind { return TypeKind(C.LLVMGetTypeKind(t.C)) } // See llvm::LLVMType::getContext. func (t Type) Context() (c Context) { c.C = C.LLVMGetTypeContext(t.C) return } // Operations on integer types func (c Context) Int1Type() (t Type) { t.C = C.LLVMInt1TypeInContext(c.C); return } func (c Context) Int8Type() (t Type) { t.C = C.LLVMInt8TypeInContext(c.C); return } func (c Context) Int16Type() (t Type) { t.C = C.LLVMInt16TypeInContext(c.C); return } func (c Context) Int32Type() (t Type) { t.C = C.LLVMInt32TypeInContext(c.C); return } func (c Context) Int64Type() (t Type) { t.C = C.LLVMInt64TypeInContext(c.C); return } func (c Context) IntType(numbits int) (t Type) { t.C = C.LLVMIntTypeInContext(c.C, C.unsigned(numbits)) return } func Int1Type() (t Type) { t.C = C.LLVMInt1Type(); return } func Int8Type() (t Type) { t.C = C.LLVMInt8Type(); return } func Int16Type() (t Type) { t.C = C.LLVMInt16Type(); return } func Int32Type() (t Type) { t.C = C.LLVMInt32Type(); return } func Int64Type() (t Type) { t.C = C.LLVMInt64Type(); return } func IntType(numbits int) (t Type) { t.C = C.LLVMIntType(C.unsigned(numbits)) return } func (t Type) IntTypeWidth() int { return int(C.LLVMGetIntTypeWidth(t.C)) } // Operations on real types func (c Context) FloatType() (t Type) { t.C = C.LLVMFloatTypeInContext(c.C); return } func (c Context) DoubleType() (t Type) { t.C = C.LLVMDoubleTypeInContext(c.C); return } func (c Context) X86FP80Type() (t Type) { t.C = C.LLVMX86FP80TypeInContext(c.C); return } func (c Context) FP128Type() (t Type) { t.C = C.LLVMFP128TypeInContext(c.C); return } func (c Context) PPCFP128Type() (t Type) { t.C = C.LLVMPPCFP128TypeInContext(c.C); return } func FloatType() (t Type) { t.C = C.LLVMFloatType(); return } func DoubleType() (t Type) { t.C = C.LLVMDoubleType(); return } func X86FP80Type() (t Type) { t.C = C.LLVMX86FP80Type(); return } func FP128Type() (t Type) { t.C = C.LLVMFP128Type(); return } func PPCFP128Type() (t Type) { t.C = C.LLVMPPCFP128Type(); return } // Operations on function types func FunctionType(returnType Type, paramTypes []Type, isVarArg bool) (t Type) { var pt *C.LLVMTypeRef var ptlen C.unsigned if len(paramTypes) > 0 { pt = llvmTypeRefPtr(&paramTypes[0]) ptlen = C.unsigned(len(paramTypes)) } t.C = C.LLVMFunctionType(returnType.C, pt, ptlen, boolToLLVMBool(isVarArg)) return } func (t Type) IsFunctionVarArg() bool { return C.LLVMIsFunctionVarArg(t.C) != 0 } func (t Type) ReturnType() (rt Type) { rt.C = C.LLVMGetReturnType(t.C); return } func (t Type) ParamTypesCount() int { return int(C.LLVMCountParamTypes(t.C)) } func (t Type) ParamTypes() []Type { count := t.ParamTypesCount() if count > 0 { out := make([]Type, count) C.LLVMGetParamTypes(t.C, llvmTypeRefPtr(&out[0])) return out } return nil } // Operations on struct types func (c Context) StructType(elementTypes []Type, packed bool) (t Type) { var pt *C.LLVMTypeRef var ptlen C.unsigned if len(elementTypes) > 0 { pt = llvmTypeRefPtr(&elementTypes[0]) ptlen = C.unsigned(len(elementTypes)) } t.C = C.LLVMStructTypeInContext(c.C, pt, ptlen, boolToLLVMBool(packed)) return } func StructType(elementTypes []Type, packed bool) (t Type) { var pt *C.LLVMTypeRef var ptlen C.unsigned if len(elementTypes) > 0 { pt = llvmTypeRefPtr(&elementTypes[0]) ptlen = C.unsigned(len(elementTypes)) } t.C = C.LLVMStructType(pt, ptlen, boolToLLVMBool(packed)) return } func (c Context) StructCreateNamed(name string) (t Type) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) t.C = C.LLVMStructCreateNamed(c.C, cname)
return } func (t Type) StructName() string { return C.GoString(C.LLVMGetStructName(t.C)) } func (t Type) StructSetBody(elementTypes []Type, packed bool) { var pt *C.LLVMTypeRef var ptlen C.unsigned if len(elementTypes) > 0 { pt = llvmTypeRefPtr(&elementTypes[0]) ptlen = C.unsigned(len(elementTypes)) } C.LLVMStructSetBody(t.C, pt, ptlen, boolToLLVMBool(packed)) } func (t Type) IsStructPacked() bool { return C.LLVMIsPackedStruct(t.C) != 0 } func (t Type) StructElementTypesCount() int { return int(C.LLVMCountStructElementTypes(t.C)) } func (t Type) StructElementTypes() []Type { out := make([]Type, t.StructElementTypesCount()) if len(out) > 0 { C.LLVMGetStructElementTypes(t.C, llvmTypeRefPtr(&out[0])) } return out } // Operations on array, pointer, and vector types (sequence types) func (t Type) Subtypes() (ret []Type) { ret = make([]Type, C.LLVMGetNumContainedTypes(t.C)) C.LLVMGetSubtypes(t.C, llvmTypeRefPtr(&ret[0])) return } func ArrayType(elementType Type, elementCount int) (t Type) { t.C = C.LLVMArrayType(elementType.C, C.unsigned(elementCount)) return } func PointerType(elementType Type, addressSpace int) (t Type) { t.C = C.LLVMPointerType(elementType.C, C.unsigned(addressSpace)) return } func VectorType(elementType Type, elementCount int) (t Type) { t.C = C.LLVMVectorType(elementType.C, C.unsigned(elementCount)) return } func (t Type) ElementType() (rt Type) { rt.C = C.LLVMGetElementType(t.C); return } func (t Type) ArrayLength() int { return int(C.LLVMGetArrayLength(t.C)) } func (t Type) PointerAddressSpace() int { return int(C.LLVMGetPointerAddressSpace(t.C)) } func (t Type) VectorSize() int { return int(C.LLVMGetVectorSize(t.C)) } // Operations on other types func (c Context) VoidType() (t Type) { t.C = C.LLVMVoidTypeInContext(c.C); return } func (c Context) LabelType() (t Type) { t.C = C.LLVMLabelTypeInContext(c.C); return } func (c Context) TokenType() (t Type) { t.C = C.LLVMTokenTypeInContext(c.C); return } func VoidType() (t Type) { t.C = C.LLVMVoidType(); return } func LabelType() (t Type) { t.C = C.LLVMLabelType(); return } //------------------------------------------------------------------------- // llvm.Value //------------------------------------------------------------------------- // Operations on all values func (v Value) Type() (t Type) { t.C = C.LLVMTypeOf(v.C); return } func (v Value) Name() string { return C.GoString(C.LLVMGetValueName(v.C)) } func (v Value) SetName(name string) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) C.LLVMSetValueName(v.C, cname) } func (v Value) Dump() { C.LLVMDumpValue(v.C) } func (v Value) ReplaceAllUsesWith(nv Value) { C.LLVMReplaceAllUsesWith(v.C, nv.C) } func (v Value) HasMetadata() bool { return C.LLVMHasMetadata(v.C) != 0 } func (v Value) Metadata(kind int) (rv Value) { rv.C = C.LLVMGetMetadata(v.C, C.unsigned(kind)) return } func (v Value) SetMetadata(kind int, node Metadata) { C.LLVMSetMetadata2(v.C, C.unsigned(kind), node.C) } // Conversion functions. // Return the input value if it is an instance of the specified class, otherwise NULL. // See llvm::dyn_cast_or_null<>. 
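// Illustrative sketch, not part of the upstream file: the IsA* conversions
// below mirror llvm::dyn_cast_or_null<>, so a failed cast yields a Value whose
// IsNil() reports true. The helper name constIntValue is hypothetical.
func constIntValue(v Value) (uint64, bool) {
	ci := v.IsAConstantInt()
	if ci.IsNil() {
		return 0, false
	}
	return ci.ZExtValue(), true
}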
func (v Value) IsAArgument() (rv Value) { rv.C = C.LLVMIsAArgument(v.C); return } func (v Value) IsABasicBlock() (rv Value) { rv.C = C.LLVMIsABasicBlock(v.C); return } func (v Value) IsAInlineAsm() (rv Value) { rv.C = C.LLVMIsAInlineAsm(v.C); return } func (v Value) IsAUser() (rv Value) { rv.C = C.LLVMIsAUser(v.C); return } func (v Value) IsAConstant() (rv Value) { rv.C = C.LLVMIsAConstant(v.C); return } func (v Value) IsAConstantAggregateZero() (rv Value) { rv.C = C.LLVMIsAConstantAggregateZero(v.C) return } func (v Value) IsAConstantArray() (rv Value) { rv.C = C.LLVMIsAConstantArray(v.C); return } func (v Value) IsAConstantExpr() (rv Value) { rv.C = C.LLVMIsAConstantExpr(v.C); return } func (v Value) IsAConstantFP() (rv Value) { rv.C = C.LLVMIsAConstantFP(v.C); return } func (v Value) IsAConstantInt() (rv Value) { rv.C = C.LLVMIsAConstantInt(v.C); return } func (v Value) IsAConstantPointerNull() (rv Value) { rv.C = C.LLVMIsAConstantPointerNull(v.C); return } func (v Value) IsAConstantStruct() (rv Value) { rv.C = C.LLVMIsAConstantStruct(v.C); return } func (v Value) IsAConstantVector() (rv Value) { rv.C = C.LLVMIsAConstantVector(v.C); return } func (v Value) IsAGlobalValue() (rv Value) { rv.C = C.LLVMIsAGlobalValue(v.C); return } func (v Value) IsAFunction() (rv Value) { rv.C = C.LLVMIsAFunction(v.C); return } func (v Value) IsAGlobalAlias() (rv Value) { rv.C = C.LLVMIsAGlobalAlias(v.C); return } func (v Value) IsAGlobalVariable() (rv Value) { rv.C = C.LLVMIsAGlobalVariable(v.C); return } func (v Value) IsAUndefValue() (rv Value) { rv.C = C.LLVMIsAUndefValue(v.C); return } func (v Value) IsAInstruction() (rv Value) { rv.C = C.LLVMIsAInstruction(v.C); return } func (v Value) IsABinaryOperator() (rv Value) { rv.C = C.LLVMIsABinaryOperator(v.C); return } func (v Value) IsACallInst() (rv Value) { rv.C = C.LLVMIsACallInst(v.C); return } func (v Value) IsAIntrinsicInst() (rv Value) { rv.C = C.LLVMIsAIntrinsicInst(v.C); return } func (v Value) IsADbgInfoIntrinsic() (rv Value) { rv.C = C.LLVMIsADbgInfoIntrinsic(v.C); return } func (v Value) IsADbgDeclareInst() (rv Value) { rv.C = C.LLVMIsADbgDeclareInst(v.C); return } func (v Value) IsAMemIntrinsic() (rv Value) { rv.C = C.LLVMIsAMemIntrinsic(v.C); return } func (v Value) IsAMemCpyInst() (rv Value) { rv.C = C.LLVMIsAMemCpyInst(v.C); return } func (v Value) IsAMemMoveInst() (rv Value) { rv.C = C.LLVMIsAMemMoveInst(v.C); return } func (v Value) IsAMemSetInst() (rv Value) { rv.C = C.LLVMIsAMemSetInst(v.C); return } func (v Value) IsACmpInst() (rv Value) { rv.C = C.LLVMIsACmpInst(v.C); return } func (v Value) IsAFCmpInst() (rv Value) { rv.C = C.LLVMIsAFCmpInst(v.C); return } func (v Value) IsAICmpInst() (rv Value) { rv.C = C.LLVMIsAICmpInst(v.C); return } func (v Value) IsAExtractElementInst() (rv Value) { rv.C = C.LLVMIsAExtractElementInst(v.C); return } func (v Value) IsAGetElementPtrInst() (rv Value) { rv.C = C.LLVMIsAGetElementPtrInst(v.C); return } func (v Value) IsAInsertElementInst() (rv Value) { rv.C = C.LLVMIsAInsertElementInst(v.C); return } func (v Value) IsAInsertValueInst() (rv Value) { rv.C = C.LLVMIsAInsertValueInst(v.C); return } func (v Value) IsAPHINode() (rv Value) { rv.C = C.LLVMIsAPHINode(v.C); return } func (v Value) IsASelectInst() (rv Value) { rv.C = C.LLVMIsASelectInst(v.C); return } func (v Value) IsAShuffleVectorInst() (rv Value) { rv.C = C.LLVMIsAShuffleVectorInst(v.C); return } func (v Value) IsAStoreInst() (rv Value) { rv.C = C.LLVMIsAStoreInst(v.C); return } func (v Value) IsABranchInst() (rv Value) { rv.C = 
C.LLVMIsABranchInst(v.C); return } func (v Value) IsAInvokeInst() (rv Value) { rv.C = C.LLVMIsAInvokeInst(v.C); return } func (v Value) IsAReturnInst() (rv Value) { rv.C = C.LLVMIsAReturnInst(v.C); return } func (v Value) IsASwitchInst() (rv Value) { rv.C = C.LLVMIsASwitchInst(v.C); return } func (v Value) IsAUnreachableInst() (rv Value) { rv.C = C.LLVMIsAUnreachableInst(v.C); return } func (v Value) IsAUnaryInstruction() (rv Value) { rv.C = C.LLVMIsAUnaryInstruction(v.C); return } func (v Value) IsAAllocaInst() (rv Value) { rv.C = C.LLVMIsAAllocaInst(v.C); return } func (v Value) IsACastInst() (rv Value) { rv.C = C.LLVMIsACastInst(v.C); return } func (v Value) IsABitCastInst() (rv Value) { rv.C = C.LLVMIsABitCastInst(v.C); return } func (v Value) IsAFPExtInst() (rv Value) { rv.C = C.LLVMIsAFPExtInst(v.C); return } func (v Value) IsAFPToSIInst() (rv Value) { rv.C = C.LLVMIsAFPToSIInst(v.C); return } func (v Value) IsAFPToUIInst() (rv Value) { rv.C = C.LLVMIsAFPToUIInst(v.C); return } func (v Value) IsAFPTruncInst() (rv Value) { rv.C = C.LLVMIsAFPTruncInst(v.C); return } func (v Value) IsAIntToPtrInst() (rv Value) { rv.C = C.LLVMIsAIntToPtrInst(v.C); return } func (v Value) IsAPtrToIntInst() (rv Value) { rv.C = C.LLVMIsAPtrToIntInst(v.C); return } func (v Value) IsASExtInst() (rv Value) { rv.C = C.LLVMIsASExtInst(v.C); return } func (v Value) IsASIToFPInst() (rv Value) { rv.C = C.LLVMIsASIToFPInst(v.C); return } func (v Value) IsATruncInst() (rv Value) { rv.C = C.LLVMIsATruncInst(v.C); return } func (v Value) IsAUIToFPInst() (rv Value) { rv.C = C.LLVMIsAUIToFPInst(v.C); return } func (v Value) IsAZExtInst() (rv Value) { rv.C = C.LLVMIsAZExtInst(v.C); return } func (v Value) IsAExtractValueInst() (rv Value) { rv.C = C.LLVMIsAExtractValueInst(v.C); return } func (v Value) IsALoadInst() (rv Value) { rv.C = C.LLVMIsALoadInst(v.C); return } func (v Value) IsAVAArgInst() (rv Value) { rv.C = C.LLVMIsAVAArgInst(v.C); return } // Operations on Uses func (v Value) FirstUse() (u Use) { u.C = C.LLVMGetFirstUse(v.C); return } func (u Use) NextUse() (ru Use) { ru.C = C.LLVMGetNextUse(u.C); return } func (u Use) User() (v Value) { v.C = C.LLVMGetUser(u.C); return } func (u Use) UsedValue() (v Value) { v.C = C.LLVMGetUsedValue(u.C); return } // Operations on Users func (v Value) Operand(i int) (rv Value) { rv.C = C.LLVMGetOperand(v.C, C.unsigned(i)); return } func (v Value) SetOperand(i int, op Value) { C.LLVMSetOperand(v.C, C.unsigned(i), op.C) } func (v Value) OperandsCount() int { return int(C.LLVMGetNumOperands(v.C)) } // Operations on constants of any type func ConstNull(t Type) (v Value) { v.C = C.LLVMConstNull(t.C); return } func ConstAllOnes(t Type) (v Value) { v.C = C.LLVMConstAllOnes(t.C); return } func Undef(t Type) (v Value) { v.C = C.LLVMGetUndef(t.C); return } func (v Value) IsConstant() bool { return C.LLVMIsConstant(v.C) != 0 } func (v Value) IsNull() bool { return C.LLVMIsNull(v.C) != 0 } func (v Value) IsUndef() bool { return C.LLVMIsUndef(v.C) != 0 } func ConstPointerNull(t Type) (v Value) { v.C = C.LLVMConstPointerNull(t.C); return } // Operations on metadata func (c Context) MDString(str string) (md Metadata) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) md.C = C.LLVMMDString2(c.C, cstr, C.unsigned(len(str))) return } func (c Context) MDNode(mds []Metadata) (md Metadata) { ptr, nvals := llvmMetadataRefs(mds) md.C = C.LLVMMDNode2(c.C, ptr, nvals) return } func (v Value) ConstantAsMetadata() (md Metadata) { md.C = C.LLVMConstantAsMetadata(v.C) return } // Operations on 
scalar constants func ConstInt(t Type, n uint64, signExtend bool) (v Value) { v.C = C.LLVMConstInt(t.C, C.ulonglong(n), boolToLLVMBool(signExtend)) return } func ConstIntFromString(t Type, str string, radix int) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) v.C = C.LLVMConstIntOfString(t.C, cstr, C.uint8_t(radix)) return } func ConstFloat(t Type, n float64) (v Value) { v.C = C.LLVMConstReal(t.C, C.double(n)) return } func ConstFloatFromString(t Type, str string) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) v.C = C.LLVMConstRealOfString(t.C, cstr) return } func (v Value) ZExtValue() uint64 { return uint64(C.LLVMConstIntGetZExtValue(v.C)) } func (v Value) SExtValue() int64 { return int64(C.LLVMConstIntGetSExtValue(v.C)) } // Operations on composite constants func (c Context) ConstString(str string, addnull bool) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) v.C = C.LLVMConstStringInContext(c.C, cstr, C.unsigned(len(str)), boolToLLVMBool(!addnull)) return } func (c Context) ConstStruct(constVals []Value, packed bool) (v Value) { ptr, nvals := llvmValueRefs(constVals) v.C = C.LLVMConstStructInContext(c.C, ptr, nvals, boolToLLVMBool(packed)) return } func ConstNamedStruct(t Type, constVals []Value) (v Value) { ptr, nvals := llvmValueRefs(constVals) v.C = C.LLVMConstNamedStruct(t.C, ptr, nvals) return } func ConstString(str string, addnull bool) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) v.C = C.LLVMConstString(cstr, C.unsigned(len(str)), boolToLLVMBool(!addnull)) return } func ConstArray(t Type, constVals []Value) (v Value) { ptr, nvals := llvmValueRefs(constVals) v.C = C.LLVMConstArray(t.C, ptr, nvals) return } func ConstStruct(constVals []Value, packed bool) (v Value) { ptr, nvals := llvmValueRefs(constVals) v.C = C.LLVMConstStruct(ptr, nvals, boolToLLVMBool(packed)) return } func ConstVector(scalarConstVals []Value, packed bool) (v Value) { ptr, nvals := llvmValueRefs(scalarConstVals) v.C = C.LLVMConstVector(ptr, nvals) return } // Constant expressions func (v Value) Opcode() Opcode { return Opcode(C.LLVMGetConstOpcode(v.C)) } func (v Value) InstructionOpcode() Opcode { return Opcode(C.LLVMGetInstructionOpcode(v.C)) } func AlignOf(t Type) (v Value) { v.C = C.LLVMAlignOf(t.C); return } func SizeOf(t Type) (v Value) { v.C = C.LLVMSizeOf(t.C); return } func ConstNeg(v Value) (rv Value) { rv.C = C.LLVMConstNeg(v.C); return } func ConstNSWNeg(v Value) (rv Value) { rv.C = C.LLVMConstNSWNeg(v.C); return } func ConstNUWNeg(v Value) (rv Value) { rv.C = C.LLVMConstNUWNeg(v.C); return } func ConstFNeg(v Value) (rv Value) { rv.C = C.LLVMConstFNeg(v.C); return } func ConstNot(v Value) (rv Value) { rv.C = C.LLVMConstNot(v.C); return } func ConstAdd(lhs, rhs Value) (v Value) { v.C = C.LLVMConstAdd(lhs.C, rhs.C); return } func ConstNSWAdd(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNSWAdd(lhs.C, rhs.C); return } func ConstNUWAdd(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNUWAdd(lhs.C, rhs.C); return } func ConstFAdd(lhs, rhs Value) (v Value) { v.C = C.LLVMConstFAdd(lhs.C, rhs.C); return } func ConstSub(lhs, rhs Value) (v Value) { v.C = C.LLVMConstSub(lhs.C, rhs.C); return } func ConstNSWSub(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNSWSub(lhs.C, rhs.C); return } func ConstNUWSub(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNUWSub(lhs.C, rhs.C); return } func ConstFSub(lhs, rhs Value) (v Value) { v.C = C.LLVMConstFSub(lhs.C, rhs.C); return } func ConstMul(lhs, rhs Value) (v Value) { v.C = 
C.LLVMConstMul(lhs.C, rhs.C); return } func ConstNSWMul(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNSWMul(lhs.C, rhs.C); return } func ConstNUWMul(lhs, rhs Value) (v Value) { v.C = C.LLVMConstNUWMul(lhs.C, rhs.C); return } func ConstFMul(lhs, rhs Value) (v Value) { v.C = C.LLVMConstFMul(lhs.C, rhs.C); return } func ConstUDiv(lhs, rhs Value) (v Value) { v.C = C.LLVMConstUDiv(lhs.C, rhs.C); return } func ConstSDiv(lhs, rhs Value) (v Value) { v.C = C.LLVMConstSDiv(lhs.C, rhs.C); return } func ConstExactSDiv(lhs, rhs Value) (v Value) { v.C = C.LLVMConstExactSDiv(lhs.C, rhs.C); return } func ConstFDiv(lhs, rhs Value) (v Value) { v.C = C.LLVMConstFDiv(lhs.C, rhs.C); return } func ConstURem(lhs, rhs Value) (v Value) { v.C = C.LLVMConstURem(lhs.C, rhs.C); return } func ConstSRem(lhs, rhs Value) (v Value) { v.C = C.LLVMConstSRem(lhs.C, rhs.C); return } func ConstFRem(lhs, rhs Value) (v Value) { v.C = C.LLVMConstFRem(lhs.C, rhs.C); return } func ConstAnd(lhs, rhs Value) (v Value) { v.C = C.LLVMConstAnd(lhs.C, rhs.C); return } func ConstOr(lhs, rhs Value) (v Value) { v.C = C.LLVMConstOr(lhs.C, rhs.C); return } func ConstXor(lhs, rhs Value) (v Value) { v.C = C.LLVMConstXor(lhs.C, rhs.C); return } func ConstICmp(pred IntPredicate, lhs, rhs Value) (v Value) { v.C = C.LLVMConstICmp(C.LLVMIntPredicate(pred), lhs.C, rhs.C) return } func ConstFCmp(pred FloatPredicate, lhs, rhs Value) (v Value) { v.C = C.LLVMConstFCmp(C.LLVMRealPredicate(pred), lhs.C, rhs.C) return } func ConstShl(lhs, rhs Value) (v Value) { v.C = C.LLVMConstShl(lhs.C, rhs.C); return } func ConstLShr(lhs, rhs Value) (v Value) { v.C = C.LLVMConstLShr(lhs.C, rhs.C); return } func ConstAShr(lhs, rhs Value) (v Value) { v.C = C.LLVMConstAShr(lhs.C, rhs.C); return } func ConstGEP(v Value, indices []Value) (rv Value) { ptr, nvals := llvmValueRefs(indices) rv.C = C.LLVMConstGEP(v.C, ptr, nvals) return } func ConstInBoundsGEP(v Value, indices []Value) (rv Value) { ptr, nvals := llvmValueRefs(indices) rv.C = C.LLVMConstInBoundsGEP(v.C, ptr, nvals) return } func ConstTrunc(v Value, t Type) (rv Value) { rv.C = C.LLVMConstTrunc(v.C, t.C); return } func ConstSExt(v Value, t Type) (rv Value) { rv.C = C.LLVMConstSExt(v.C, t.C); return } func ConstZExt(v Value, t Type) (rv Value) { rv.C = C.LLVMConstZExt(v.C, t.C); return } func ConstFPTrunc(v Value, t Type) (rv Value) { rv.C = C.LLVMConstFPTrunc(v.C, t.C); return } func ConstFPExt(v Value, t Type) (rv Value) { rv.C = C.LLVMConstFPExt(v.C, t.C); return } func ConstUIToFP(v Value, t Type) (rv Value) { rv.C = C.LLVMConstUIToFP(v.C, t.C); return } func ConstSIToFP(v Value, t Type) (rv Value) { rv.C = C.LLVMConstSIToFP(v.C, t.C); return } func ConstFPToUI(v Value, t Type) (rv Value) { rv.C = C.LLVMConstFPToUI(v.C, t.C); return } func ConstFPToSI(v Value, t Type) (rv Value) { rv.C = C.LLVMConstFPToSI(v.C, t.C); return } func ConstPtrToInt(v Value, t Type) (rv Value) { rv.C = C.LLVMConstPtrToInt(v.C, t.C); return } func ConstIntToPtr(v Value, t Type) (rv Value) { rv.C = C.LLVMConstIntToPtr(v.C, t.C); return } func ConstBitCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstBitCast(v.C, t.C); return } func ConstZExtOrBitCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstZExtOrBitCast(v.C, t.C); return } func ConstSExtOrBitCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstSExtOrBitCast(v.C, t.C); return } func ConstTruncOrBitCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstTruncOrBitCast(v.C, t.C) return } func ConstPointerCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstPointerCast(v.C, t.C); return 
} func ConstIntCast(v Value, t Type, signed bool) (rv Value) { rv.C = C.LLVMConstIntCast(v.C, t.C, boolToLLVMBool(signed)) return } func ConstFPCast(v Value, t Type) (rv Value) { rv.C = C.LLVMConstFPCast(v.C, t.C); return } func ConstSelect(cond, iftrue, iffalse Value) (rv Value) { rv.C = C.LLVMConstSelect(cond.C, iftrue.C, iffalse.C) return } func ConstExtractElement(vec, i Value) (rv Value) { rv.C = C.LLVMConstExtractElement(vec.C, i.C) return } func ConstInsertElement(vec, elem, i Value) (rv Value) { rv.C = C.LLVMConstInsertElement(vec.C, elem.C, i.C) return } func ConstShuffleVector(veca, vecb, mask Value) (rv Value) { rv.C = C.LLVMConstShuffleVector(veca.C, vecb.C, mask.C) return } //TODO //LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList, // unsigned NumIdx); func ConstExtractValue(agg Value, indices []uint32) (rv Value) { n := len(indices) if n == 0 { panic("one or more indices are required") } ptr := (*C.unsigned)(&indices[0]) rv.C = C.LLVMConstExtractValue(agg.C, ptr, C.unsigned(n)) return } func ConstInsertValue(agg, val Value, indices []uint32) (rv Value) { n := len(indices) if n == 0 { panic("one or more indices are required") } ptr := (*C.unsigned)(&indices[0]) rv.C = C.LLVMConstInsertValue(agg.C, val.C, ptr, C.unsigned(n)) return } func BlockAddress(f Value, bb BasicBlock) (v Value) { v.C = C.LLVMBlockAddress(f.C, bb.C) return } // Operations on global variables, functions, and aliases (globals) func (v Value) GlobalParent() (m Module) { m.C = C.LLVMGetGlobalParent(v.C); return } func (v Value) IsDeclaration() bool { return C.LLVMIsDeclaration(v.C) != 0 } func (v Value) Linkage() Linkage { return Linkage(C.LLVMGetLinkage(v.C)) } func (v Value) SetLinkage(l Linkage) { C.LLVMSetLinkage(v.C, C.LLVMLinkage(l)) } func (v Value) Section() string { return C.GoString(C.LLVMGetSection(v.C)) } func (v Value) SetSection(str string) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) C.LLVMSetSection(v.C, cstr) } func (v Value) Visibility() Visibility { return Visibility(C.LLVMGetVisibility(v.C)) } func (v Value) SetVisibility(vi Visibility) { C.LLVMSetVisibility(v.C, C.LLVMVisibility(vi)) } func (v Value) Alignment() int { return int(C.LLVMGetAlignment(v.C)) } func (v Value) SetAlignment(a int) { C.LLVMSetAlignment(v.C, C.unsigned(a)) } func (v Value) SetUnnamedAddr(ua bool) { C.LLVMSetUnnamedAddr(v.C, boolToLLVMBool(ua)) } // Operations on global variables func AddGlobal(m Module, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMAddGlobal(m.C, t.C, cname) return } func AddGlobalInAddressSpace(m Module, t Type, name string, addressSpace int) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMAddGlobalInAddressSpace(m.C, t.C, cname, C.unsigned(addressSpace)) return } func (m Module) NamedGlobal(name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMGetNamedGlobal(m.C, cname) return } func (m Module) FirstGlobal() (v Value) { v.C = C.LLVMGetFirstGlobal(m.C); return } func (m Module) LastGlobal() (v Value) { v.C = C.LLVMGetLastGlobal(m.C); return } func NextGlobal(v Value) (rv Value) { rv.C = C.LLVMGetNextGlobal(v.C); return } func PrevGlobal(v Value) (rv Value) { rv.C = C.LLVMGetPreviousGlobal(v.C); return } func (v Value) EraseFromParentAsGlobal() { C.LLVMDeleteGlobal(v.C) } func (v Value) Initializer() (rv Value) { rv.C = C.LLVMGetInitializer(v.C); return } func (v Value) SetInitializer(cv Value) { 
C.LLVMSetInitializer(v.C, cv.C) } func (v Value) IsThreadLocal() bool { return C.LLVMIsThreadLocal(v.C) != 0 } func (v Value) SetThreadLocal(tl bool) { C.LLVMSetThreadLocal(v.C, boolToLLVMBool(tl)) } func (v Value) IsGlobalConstant() bool { return C.LLVMIsGlobalConstant(v.C) != 0 } func (v Value) SetGlobalConstant(gc bool) { C.LLVMSetGlobalConstant(v.C, boolToLLVMBool(gc)) } func (v Value) IsVolatile() bool { return C.LLVMGetVolatile(v.C) != 0 } func (v Value) SetVolatile(volatile bool) { C.LLVMSetVolatile(v.C, boolToLLVMBool(volatile)) } // Operations on aliases func AddAlias(m Module, t Type, aliasee Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMAddAlias(m.C, t.C, aliasee.C, cname) return } // Operations on comdat func (m Module) Comdat(name string) (c Comdat) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) c.C = C.LLVMGetOrInsertComdat(m.C, cname) return } func (v Value) Comdat() (c Comdat) { c.C = C.LLVMGetComdat(v.C); return } func (v Value) SetComdat(c Comdat) { C.LLVMSetComdat(v.C, c.C) } func (c Comdat) SelectionKind() ComdatSelectionKind { return ComdatSelectionKind(C.LLVMGetComdatSelectionKind(c.C)) } func (c Comdat) SetSelectionKind(k ComdatSelectionKind) { C.LLVMSetComdatSelectionKind(c.C, (C.LLVMComdatSelectionKind)(k)) } // Operations on functions func AddFunction(m Module, name string, ft Type) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMAddFunction(m.C, cname, ft.C) return } func (m Module) NamedFunction(name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMGetNamedFunction(m.C, cname) return } func (m Module) FirstFunction() (v Value) { v.C = C.LLVMGetFirstFunction(m.C); return } func (m Module) LastFunction() (v Value) { v.C = C.LLVMGetLastFunction(m.C); return } func NextFunction(v Value) (rv Value) { rv.C = C.LLVMGetNextFunction(v.C); return } func PrevFunction(v Value) (rv Value) { rv.C = C.LLVMGetPreviousFunction(v.C); return } func (v Value) EraseFromParentAsFunction() { C.LLVMDeleteFunction(v.C) } func (v Value) IntrinsicID() int { return int(C.LLVMGetIntrinsicID(v.C)) } func (v Value) FunctionCallConv() CallConv { return CallConv(C.LLVMCallConv(C.LLVMGetFunctionCallConv(v.C))) } func (v Value) SetFunctionCallConv(cc CallConv) { C.LLVMSetFunctionCallConv(v.C, C.unsigned(cc)) } func (v Value) GC() string { return C.GoString(C.LLVMGetGC(v.C)) } func (v Value) SetGC(name string) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) C.LLVMSetGC(v.C, cname) } func (v Value) AddAttributeAtIndex(i int, a Attribute) { C.LLVMAddAttributeAtIndex(v.C, C.LLVMAttributeIndex(i), a.C) } func (v Value) AddFunctionAttr(a Attribute) { v.AddAttributeAtIndex(C.LLVMAttributeFunctionIndex, a); } func (v Value) GetEnumAttributeAtIndex(i int, kind uint) (a Attribute) { a.C = C.LLVMGetEnumAttributeAtIndex(v.C, C.LLVMAttributeIndex(i), C.unsigned(kind)) return } func (v Value) GetEnumFunctionAttribute(kind uint) Attribute { return v.GetEnumAttributeAtIndex(C.LLVMAttributeFunctionIndex, kind) } func (v Value) GetStringAttributeAtIndex(i int, kind string) (a Attribute) { ckind := C.CString(kind) defer C.free(unsafe.Pointer(ckind)) a.C = C.LLVMGetStringAttributeAtIndex(v.C, C.LLVMAttributeIndex(i), ckind, C.unsigned(len(kind))) return } func (v Value) RemoveEnumAttributeAtIndex(i int, kind uint) { C.LLVMRemoveEnumAttributeAtIndex(v.C, C.LLVMAttributeIndex(i), C.unsigned(kind)) } func (v Value) RemoveEnumFunctionAttribute(kind 
uint) { v.RemoveEnumAttributeAtIndex(C.LLVMAttributeFunctionIndex, kind); } func (v Value) RemoveStringAttributeAtIndex(i int, kind string) { ckind := C.CString(kind) defer C.free(unsafe.Pointer(ckind)) C.LLVMRemoveStringAttributeAtIndex(v.C, C.LLVMAttributeIndex(i), ckind, C.unsigned(len(kind))) } func (v Value) AddTargetDependentFunctionAttr(attr, value string) { cattr := C.CString(attr) defer C.free(unsafe.Pointer(cattr)) cvalue := C.CString(value) defer C.free(unsafe.Pointer(cvalue)) C.LLVMAddTargetDependentFunctionAttr(v.C, cattr, cvalue) } func (v Value) SetPersonality(p Value) { C.LLVMSetPersonalityFn(v.C, p.C) } // Operations on parameters func (v Value) ParamsCount() int { return int(C.LLVMCountParams(v.C)) } func (v Value) Params() []Value { out := make([]Value, v.ParamsCount()) if len(out) > 0 { C.LLVMGetParams(v.C, llvmValueRefPtr(&out[0])) } return out } func (v Value) Param(i int) (rv Value) { rv.C = C.LLVMGetParam(v.C, C.unsigned(i)); return } func (v Value) ParamParent() (rv Value) { rv.C = C.LLVMGetParamParent(v.C); return } func (v Value) FirstParam() (rv Value) { rv.C = C.LLVMGetFirstParam(v.C); return } func (v Value) LastParam() (rv Value) { rv.C = C.LLVMGetLastParam(v.C); return } func NextParam(v Value) (rv Value) { rv.C = C.LLVMGetNextParam(v.C); return } func PrevParam(v Value) (rv Value) { rv.C = C.LLVMGetPreviousParam(v.C); return } func (v Value) SetParamAlignment(align int) { C.LLVMSetParamAlignment(v.C, C.unsigned(align)) } // Operations on basic blocks func (bb BasicBlock) AsValue() (v Value) { v.C = C.LLVMBasicBlockAsValue(bb.C); return } func (v Value) IsBasicBlock() bool { return C.LLVMValueIsBasicBlock(v.C) != 0 } func (v Value) AsBasicBlock() (bb BasicBlock) { bb.C = C.LLVMValueAsBasicBlock(v.C); return } func (bb BasicBlock) Parent() (v Value) { v.C = C.LLVMGetBasicBlockParent(bb.C); return } func (v Value) BasicBlocksCount() int { return int(C.LLVMCountBasicBlocks(v.C)) } func (v Value) BasicBlocks() []BasicBlock { out := make([]BasicBlock, v.BasicBlocksCount()) C.LLVMGetBasicBlocks(v.C, llvmBasicBlockRefPtr(&out[0])) return out } func (v Value) FirstBasicBlock() (bb BasicBlock) { bb.C = C.LLVMGetFirstBasicBlock(v.C); return } func (v Value) LastBasicBlock() (bb BasicBlock) { bb.C = C.LLVMGetLastBasicBlock(v.C); return } func NextBasicBlock(bb BasicBlock) (rbb BasicBlock) { rbb.C = C.LLVMGetNextBasicBlock(bb.C); return } func PrevBasicBlock(bb BasicBlock) (rbb BasicBlock) { rbb.C = C.LLVMGetPreviousBasicBlock(bb.C); return } func (v Value) EntryBasicBlock() (bb BasicBlock) { bb.C = C.LLVMGetEntryBasicBlock(v.C); return } func (c Context) AddBasicBlock(f Value, name string) (bb BasicBlock) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) bb.C = C.LLVMAppendBasicBlockInContext(c.C, f.C, cname) return } func (c Context) InsertBasicBlock(ref BasicBlock, name string) (bb BasicBlock) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) bb.C = C.LLVMInsertBasicBlockInContext(c.C, ref.C, cname) return } func AddBasicBlock(f Value, name string) (bb BasicBlock) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) bb.C = C.LLVMAppendBasicBlock(f.C, cname) return } func InsertBasicBlock(ref BasicBlock, name string) (bb BasicBlock) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) bb.C = C.LLVMInsertBasicBlock(ref.C, cname) return } func (bb BasicBlock) EraseFromParent() { C.LLVMDeleteBasicBlock(bb.C) } func (bb BasicBlock) MoveBefore(pos BasicBlock) { C.LLVMMoveBasicBlockBefore(bb.C, pos.C) } func (bb BasicBlock) 
MoveAfter(pos BasicBlock) { C.LLVMMoveBasicBlockAfter(bb.C, pos.C) } // Operations on instructions func (v Value) InstructionParent() (bb BasicBlock) { bb.C = C.LLVMGetInstructionParent(v.C); return } func (bb BasicBlock) FirstInstruction() (v Value) { v.C = C.LLVMGetFirstInstruction(bb.C); return } func (bb BasicBlock) LastInstruction() (v Value) { v.C = C.LLVMGetLastInstruction(bb.C); return } func NextInstruction(v Value) (rv Value) { rv.C = C.LLVMGetNextInstruction(v.C); return } func PrevInstruction(v Value) (rv Value) { rv.C = C.LLVMGetPreviousInstruction(v.C); return } // Operations on call sites func (v Value) SetInstructionCallConv(cc CallConv) { C.LLVMSetInstructionCallConv(v.C, C.unsigned(cc)) } func (v Value) InstructionCallConv() CallConv { return CallConv(C.LLVMCallConv(C.LLVMGetInstructionCallConv(v.C))) } func (v Value) AddCallSiteAttribute(i int, a Attribute) { C.LLVMAddCallSiteAttribute(v.C, C.LLVMAttributeIndex(i), a.C) } func (v Value) SetInstrParamAlignment(i int, align int) { C.LLVMSetInstrParamAlignment(v.C, C.unsigned(i), C.unsigned(align)) } // Operations on call instructions (only) func (v Value) IsTailCall() bool { return C.LLVMIsTailCall(v.C) != 0 } func (v Value) SetTailCall(is bool) { C.LLVMSetTailCall(v.C, boolToLLVMBool(is)) } // Operations on phi nodes func (v Value) AddIncoming(vals []Value, blocks []BasicBlock) { ptr, nvals := llvmValueRefs(vals) C.LLVMAddIncoming(v.C, ptr, llvmBasicBlockRefPtr(&blocks[0]), nvals) } func (v Value) IncomingCount() int { return int(C.LLVMCountIncoming(v.C)) } func (v Value) IncomingValue(i int) (rv Value) { rv.C = C.LLVMGetIncomingValue(v.C, C.unsigned(i)) return } func (v Value) IncomingBlock(i int) (bb BasicBlock) { bb.C = C.LLVMGetIncomingBlock(v.C, C.unsigned(i)) return } // Operations on inline assembly func InlineAsm(t Type, asmString, constraints string, hasSideEffects, isAlignStack bool, dialect InlineAsmDialect) (rv Value) { casm := C.CString(asmString) defer C.free(unsafe.Pointer(casm)) cconstraints := C.CString(constraints) defer C.free(unsafe.Pointer(cconstraints)) rv.C = C.LLVMGetInlineAsm(t.C, casm, C.size_t(len(asmString)), cconstraints, C.size_t(len(constraints)), boolToLLVMBool(hasSideEffects), boolToLLVMBool(isAlignStack), C.LLVMInlineAsmDialect(dialect)) return } // Operations on aggregates func (v Value) Indices() []uint32 { num := C.LLVMGetNumIndices(v.C) indicesPtr := C.LLVMGetIndices(v.C) // https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices - rawIndices := (*[1 << 30]C.uint)(unsafe.Pointer(indicesPtr))[:num:num] + rawIndices := (*[1 << 20]C.uint)(unsafe.Pointer(indicesPtr))[:num:num] indices := make([]uint32, num) for i := range indices { indices[i] = uint32(rawIndices[i]) } return indices } //------------------------------------------------------------------------- // llvm.Builder //------------------------------------------------------------------------- // An instruction builder represents a point within a basic block, and is the // exclusive means of building instructions using the C interface. 
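// Illustrative sketch, not part of the upstream file: a typical builder
// session that emits add(a, b int32) int32 into a module. The helper name
// buildAddFn is hypothetical; every method it calls is defined in this file.
func buildAddFn(ctx Context, mod Module) Value {
	i32 := ctx.Int32Type()
	ft := FunctionType(i32, []Type{i32, i32}, false)
	fn := AddFunction(mod, "add", ft)
	entry := ctx.AddBasicBlock(fn, "entry")
	b := ctx.NewBuilder()
	defer b.Dispose()
	b.SetInsertPointAtEnd(entry)
	sum := b.CreateAdd(fn.Param(0), fn.Param(1), "sum")
	b.CreateRet(sum)
	return fn
}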
func (c Context) NewBuilder() (b Builder) { b.C = C.LLVMCreateBuilderInContext(c.C); return } func NewBuilder() (b Builder) { b.C = C.LLVMCreateBuilder(); return } func (b Builder) SetInsertPoint(block BasicBlock, instr Value) { C.LLVMPositionBuilder(b.C, block.C, instr.C) } func (b Builder) SetInsertPointBefore(instr Value) { C.LLVMPositionBuilderBefore(b.C, instr.C) } func (b Builder) SetInsertPointAtEnd(block BasicBlock) { C.LLVMPositionBuilderAtEnd(b.C, block.C) } func (b Builder) GetInsertBlock() (bb BasicBlock) { bb.C = C.LLVMGetInsertBlock(b.C); return } func (b Builder) ClearInsertionPoint() { C.LLVMClearInsertionPosition(b.C) } func (b Builder) Insert(instr Value) { C.LLVMInsertIntoBuilder(b.C, instr.C) } func (b Builder) InsertWithName(instr Value, name string) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) C.LLVMInsertIntoBuilderWithName(b.C, instr.C, cname) } func (b Builder) Dispose() { C.LLVMDisposeBuilder(b.C) } // Metadata type DebugLoc struct { Line, Col uint Scope Metadata InlinedAt Metadata } func (b Builder) SetCurrentDebugLocation(line, col uint, scope, inlinedAt Metadata) { C.LLVMSetCurrentDebugLocation2(b.C, C.unsigned(line), C.unsigned(col), scope.C, inlinedAt.C) } // Get current debug location. Please do not call this function until setting debug location with SetCurrentDebugLocation() func (b Builder) GetCurrentDebugLocation() (loc DebugLoc) { md := C.LLVMGetCurrentDebugLocation2(b.C) loc.Line = uint(md.Line) loc.Col = uint(md.Col) loc.Scope = Metadata{C: md.Scope} loc.InlinedAt = Metadata{C: md.InlinedAt} return } func (b Builder) SetInstDebugLocation(v Value) { C.LLVMSetInstDebugLocation(b.C, v.C) } func (b Builder) InsertDeclare(module Module, storage Value, md Value) Value { f := module.NamedFunction("llvm.dbg.declare") if f.IsNil() { ftyp := FunctionType(VoidType(), []Type{storage.Type(), md.Type()}, false) f = AddFunction(module, "llvm.dbg.declare", ftyp) } return b.CreateCall(f, []Value{storage, md}, "") } // Terminators func (b Builder) CreateRetVoid() (rv Value) { rv.C = C.LLVMBuildRetVoid(b.C); return } func (b Builder) CreateRet(v Value) (rv Value) { rv.C = C.LLVMBuildRet(b.C, v.C); return } func (b Builder) CreateAggregateRet(vs []Value) (rv Value) { ptr, nvals := llvmValueRefs(vs) rv.C = C.LLVMBuildAggregateRet(b.C, ptr, nvals) return } func (b Builder) CreateBr(bb BasicBlock) (rv Value) { rv.C = C.LLVMBuildBr(b.C, bb.C); return } func (b Builder) CreateCondBr(ifv Value, thenb, elseb BasicBlock) (rv Value) { rv.C = C.LLVMBuildCondBr(b.C, ifv.C, thenb.C, elseb.C) return } func (b Builder) CreateSwitch(v Value, elseb BasicBlock, numCases int) (rv Value) { rv.C = C.LLVMBuildSwitch(b.C, v.C, elseb.C, C.unsigned(numCases)) return } func (b Builder) CreateIndirectBr(addr Value, numDests int) (rv Value) { rv.C = C.LLVMBuildIndirectBr(b.C, addr.C, C.unsigned(numDests)) return } func (b Builder) CreateInvoke(fn Value, args []Value, then, catch BasicBlock, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) ptr, nvals := llvmValueRefs(args) rv.C = C.LLVMBuildInvoke(b.C, fn.C, ptr, nvals, then.C, catch.C, cname) return } func (b Builder) CreateUnreachable() (rv Value) { rv.C = C.LLVMBuildUnreachable(b.C); return } // Add a case to the switch instruction func (v Value) AddCase(on Value, dest BasicBlock) { C.LLVMAddCase(v.C, on.C, dest.C) } // Add a destination to the indirectbr instruction func (v Value) AddDest(dest BasicBlock) { C.LLVMAddDestination(v.C, dest.C) } // Arithmetic func (b Builder) CreateAdd(lhs, 
rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildAdd(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNSWAdd(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNSWAdd(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNUWAdd(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNUWAdd(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateFAdd(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFAdd(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateSub(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSub(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNSWSub(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNSWSub(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNUWSub(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNUWSub(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateFSub(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) v.C = C.LLVMBuildFSub(b.C, lhs.C, rhs.C, cname) C.free(unsafe.Pointer(cname)) return } func (b Builder) CreateMul(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildMul(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNSWMul(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNSWMul(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateNUWMul(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildNUWMul(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateFMul(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFMul(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateUDiv(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildUDiv(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateSDiv(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSDiv(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateExactSDiv(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildExactSDiv(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateFDiv(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFDiv(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateURem(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildURem(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateSRem(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSRem(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateFRem(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFRem(b.C, lhs.C, rhs.C, cname) return } func (b 
Builder) CreateShl(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildShl(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateLShr(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildLShr(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateAShr(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildAShr(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateAnd(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildAnd(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateOr(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildOr(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateXor(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildXor(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateBinOp(op Opcode, lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildBinOp(b.C, C.LLVMOpcode(op), lhs.C, rhs.C, cname) return } func (b Builder) CreateNeg(v Value, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) rv.C = C.LLVMBuildNeg(b.C, v.C, cname) return } func (b Builder) CreateNSWNeg(v Value, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) rv.C = C.LLVMBuildNSWNeg(b.C, v.C, cname) return } func (b Builder) CreateNUWNeg(v Value, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) rv.C = C.LLVMBuildNUWNeg(b.C, v.C, cname) return } func (b Builder) CreateFNeg(v Value, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) rv.C = C.LLVMBuildFNeg(b.C, v.C, cname) return } func (b Builder) CreateNot(v Value, name string) (rv Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) rv.C = C.LLVMBuildNot(b.C, v.C, cname) return } // Memory func (b Builder) CreateMalloc(t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildMalloc(b.C, t.C, cname) return } func (b Builder) CreateArrayMalloc(t Type, val Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildArrayMalloc(b.C, t.C, val.C, cname) return } func (b Builder) CreateAlloca(t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildAlloca(b.C, t.C, cname) return } func (b Builder) CreateArrayAlloca(t Type, val Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildArrayAlloca(b.C, t.C, val.C, cname) return } func (b Builder) CreateFree(p Value) (v Value) { v.C = C.LLVMBuildFree(b.C, p.C) return } func (b Builder) CreateLoad(p Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildLoad(b.C, p.C, cname) return } func (b Builder) CreateStore(val Value, p Value) (v Value) { v.C = C.LLVMBuildStore(b.C, val.C, p.C) return } func (b Builder) CreateGEP(p Value, indices []Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) ptr, nvals := llvmValueRefs(indices) v.C = C.LLVMBuildGEP(b.C, p.C, ptr, nvals, cname) 
return } func (b Builder) CreateInBoundsGEP(p Value, indices []Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) ptr, nvals := llvmValueRefs(indices) v.C = C.LLVMBuildInBoundsGEP(b.C, p.C, ptr, nvals, cname) return } func (b Builder) CreateStructGEP(p Value, i int, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildStructGEP(b.C, p.C, C.unsigned(i), cname) return } func (b Builder) CreateGlobalString(str, name string) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildGlobalString(b.C, cstr, cname) return } func (b Builder) CreateGlobalStringPtr(str, name string) (v Value) { cstr := C.CString(str) defer C.free(unsafe.Pointer(cstr)) cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildGlobalStringPtr(b.C, cstr, cname) return } // Casts func (b Builder) CreateTrunc(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildTrunc(b.C, val.C, t.C, cname) return } func (b Builder) CreateZExt(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildZExt(b.C, val.C, t.C, cname) return } func (b Builder) CreateSExt(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSExt(b.C, val.C, t.C, cname) return } func (b Builder) CreateFPToUI(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFPToUI(b.C, val.C, t.C, cname) return } func (b Builder) CreateFPToSI(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFPToSI(b.C, val.C, t.C, cname) return } func (b Builder) CreateUIToFP(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildUIToFP(b.C, val.C, t.C, cname) return } func (b Builder) CreateSIToFP(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSIToFP(b.C, val.C, t.C, cname) return } func (b Builder) CreateFPTrunc(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFPTrunc(b.C, val.C, t.C, cname) return } func (b Builder) CreateFPExt(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFPExt(b.C, val.C, t.C, cname) return } func (b Builder) CreatePtrToInt(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildPtrToInt(b.C, val.C, t.C, cname) return } func (b Builder) CreateIntToPtr(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildIntToPtr(b.C, val.C, t.C, cname) return } func (b Builder) CreateBitCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildBitCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateZExtOrBitCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildZExtOrBitCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateSExtOrBitCast(val Value, t Type, name string) (v Value) { 
cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSExtOrBitCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateTruncOrBitCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildTruncOrBitCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateCast(val Value, op Opcode, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildCast(b.C, C.LLVMOpcode(op), val.C, t.C, cname) return } // func (b Builder) CreatePointerCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildPointerCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateIntCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildIntCast(b.C, val.C, t.C, cname) return } func (b Builder) CreateFPCast(val Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFPCast(b.C, val.C, t.C, cname) return } // Comparisons func (b Builder) CreateICmp(pred IntPredicate, lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildICmp(b.C, C.LLVMIntPredicate(pred), lhs.C, rhs.C, cname) return } func (b Builder) CreateFCmp(pred FloatPredicate, lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildFCmp(b.C, C.LLVMRealPredicate(pred), lhs.C, rhs.C, cname) return } // Miscellaneous instructions func (b Builder) CreatePHI(t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildPhi(b.C, t.C, cname) return } func (b Builder) CreateCall(fn Value, args []Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) ptr, nvals := llvmValueRefs(args) v.C = C.LLVMBuildCall(b.C, fn.C, ptr, nvals, cname) return } func (b Builder) CreateSelect(ifv, thenv, elsev Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildSelect(b.C, ifv.C, thenv.C, elsev.C, cname) return } func (b Builder) CreateVAArg(list Value, t Type, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildVAArg(b.C, list.C, t.C, cname) return } func (b Builder) CreateExtractElement(vec, i Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildExtractElement(b.C, vec.C, i.C, cname) return } func (b Builder) CreateInsertElement(vec, elt, i Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildInsertElement(b.C, vec.C, elt.C, i.C, cname) return } func (b Builder) CreateShuffleVector(v1, v2, mask Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildShuffleVector(b.C, v1.C, v2.C, mask.C, cname) return } func (b Builder) CreateExtractValue(agg Value, i int, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildExtractValue(b.C, agg.C, C.unsigned(i), cname) return } func (b Builder) CreateInsertValue(agg, elt Value, i int, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildInsertValue(b.C, agg.C, elt.C, C.unsigned(i), cname) return } func (b Builder) 
CreateIsNull(val Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildIsNull(b.C, val.C, cname) return } func (b Builder) CreateIsNotNull(val Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildIsNotNull(b.C, val.C, cname) return } func (b Builder) CreatePtrDiff(lhs, rhs Value, name string) (v Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) v.C = C.LLVMBuildPtrDiff(b.C, lhs.C, rhs.C, cname) return } func (b Builder) CreateLandingPad(t Type, nclauses int, name string) (l Value) { cname := C.CString(name) defer C.free(unsafe.Pointer(cname)) l.C = C.LLVMBuildLandingPad(b.C, t.C, nil, C.unsigned(nclauses), cname) return l } func (l Value) AddClause(v Value) { C.LLVMAddClause(l.C, v.C) } func (l Value) SetCleanup(cleanup bool) { C.LLVMSetCleanup(l.C, boolToLLVMBool(cleanup)) } func (b Builder) CreateResume(ex Value) (v Value) { v.C = C.LLVMBuildResume(b.C, ex.C) return } //------------------------------------------------------------------------- // llvm.ModuleProvider //------------------------------------------------------------------------- // Changes the type of M so it can be passed to FunctionPassManagers and the // JIT. They take ModuleProviders for historical reasons. func NewModuleProviderForModule(m Module) (mp ModuleProvider) { mp.C = C.LLVMCreateModuleProviderForExistingModule(m.C) return } // Destroys the module M. func (mp ModuleProvider) Dispose() { C.LLVMDisposeModuleProvider(mp.C) } //------------------------------------------------------------------------- // llvm.MemoryBuffer //------------------------------------------------------------------------- func NewMemoryBufferFromFile(path string) (b MemoryBuffer, err error) { var cmsg *C.char cpath := C.CString(path) defer C.free(unsafe.Pointer(cpath)) fail := C.LLVMCreateMemoryBufferWithContentsOfFile(cpath, &b.C, &cmsg) if fail != 0 { b.C = nil err = errors.New(C.GoString(cmsg)) C.LLVMDisposeMessage(cmsg) } return } func NewMemoryBufferFromStdin() (b MemoryBuffer, err error) { var cmsg *C.char fail := C.LLVMCreateMemoryBufferWithSTDIN(&b.C, &cmsg) if fail != 0 { b.C = nil err = errors.New(C.GoString(cmsg)) C.LLVMDisposeMessage(cmsg) } return } func (b MemoryBuffer) Bytes() []byte { cstart := C.LLVMGetBufferStart(b.C) csize := C.LLVMGetBufferSize(b.C) return C.GoBytes(unsafe.Pointer(cstart), C.int(csize)) } func (b MemoryBuffer) Dispose() { C.LLVMDisposeMemoryBuffer(b.C) } //------------------------------------------------------------------------- // llvm.PassManager //------------------------------------------------------------------------- // Constructs a new whole-module pass pipeline. This type of pipeline is // suitable for link-time optimization and whole-module transformations. // See llvm::PassManager::PassManager. func NewPassManager() (pm PassManager) { pm.C = C.LLVMCreatePassManager(); return } // Constructs a new function-by-function pass pipeline over the module // provider. It does not take ownership of the module provider. This type of // pipeline is suitable for code generation and JIT compilation tasks. // See llvm::FunctionPassManager::FunctionPassManager. func NewFunctionPassManagerForModule(m Module) (pm PassManager) { pm.C = C.LLVMCreateFunctionPassManagerForModule(m.C) return } // Initializes, executes on the provided module, and finalizes all of the // passes scheduled in the pass manager. Returns 1 if any of the passes // modified the module, 0 otherwise. 
See llvm::PassManager::run(Module&). func (pm PassManager) Run(m Module) bool { return C.LLVMRunPassManager(pm.C, m.C) != 0 } // Initializes all of the function passes scheduled in the function pass // manager. Returns 1 if any of the passes modified the module, 0 otherwise. // See llvm::FunctionPassManager::doInitialization. func (pm PassManager) InitializeFunc() bool { return C.LLVMInitializeFunctionPassManager(pm.C) != 0 } // Executes all of the function passes scheduled in the function pass manager // on the provided function. Returns 1 if any of the passes modified the // function, false otherwise. // See llvm::FunctionPassManager::run(Function&). func (pm PassManager) RunFunc(f Value) bool { return C.LLVMRunFunctionPassManager(pm.C, f.C) != 0 } // Finalizes all of the function passes scheduled in the function pass // manager. Returns 1 if any of the passes modified the module, 0 otherwise. // See llvm::FunctionPassManager::doFinalization. func (pm PassManager) FinalizeFunc() bool { return C.LLVMFinalizeFunctionPassManager(pm.C) != 0 } // Frees the memory of a pass pipeline. For function pipelines, does not free // the module provider. // See llvm::PassManagerBase::~PassManagerBase. func (pm PassManager) Dispose() { C.LLVMDisposePassManager(pm.C) } Index: vendor/llvm/dist-release_80/docs/ReleaseNotes.rst =================================================================== --- vendor/llvm/dist-release_80/docs/ReleaseNotes.rst (revision 344764) +++ vendor/llvm/dist-release_80/docs/ReleaseNotes.rst (revision 344765) @@ -1,289 +1,264 @@ ======================== LLVM 8.0.0 Release Notes ======================== .. contents:: :local: Introduction ============ This document contains the release notes for the LLVM Compiler Infrastructure, release 8.0.0. Here we describe the status of LLVM, including major improvements from the previous release, improvements in various subprojects of LLVM, and some of the current users of the code. All LLVM releases may be downloaded -from the `LLVM releases web site `_. +from the `LLVM releases web site `_. For more information about LLVM, including information about the latest release, please check out the `main LLVM web site `_. If you have questions or comments, the `LLVM Developer's Mailing List `_ is a good place to send them. Minimum Required Compiler Version ================================= As `discussed on the mailing list `_, building LLVM will soon require more recent toolchains as follows: ============= ==== Clang 3.5 Apple Clang 6.0 GCC 5.1 Visual Studio 2017 ============= ==== A new CMake check when configuring LLVM provides a soft-error if your toolchain will become unsupported soon. You can opt out of the soft-error by setting the ``LLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN`` CMake variable to ``ON``. +Known Issues +============ + +These are issues that couldn't be fixed before the release. See the bug reports +for the latest status. + +* `PR40547 `_ Clang gets miscompiled by trunk GCC. + +* `PR40761 `_ "asan-dynamic" doesn't work on FreeBSD. + + Non-comprehensive list of changes in this release ================================================= -.. NOTE - For small 1-3 sentence descriptions, just add an entry at the end of - this list. If your description won't fit comfortably in one bullet - point (e.g. maybe you would like to give an example of the - functionality, or simply have a lot to talk about), see the `NOTE` below - for adding a new subsection. 
* The **llvm-cov** tool can now export lcov trace files using the `-format=lcov` option of the `export` command. * The ``add_llvm_loadable_module`` CMake macro has been removed. The ``add_llvm_library`` macro with the ``MODULE`` argument now provides the same functionality. See `Writing an LLVM Pass `_. * For MinGW, references to data variables that might need to be imported from a dll are accessed via a stub, to allow the linker to convert it to a dllimport if needed. * Added support for labels as offsets in ``.reloc`` directive. * Support for precise identification of X86 instructions with memory operands, by using debug information. This supports profile-driven cache prefetching. It is enabled with the ``-x86-discriminate-memops`` LLVM Flag. * Support for profile-driven software cache prefetching on X86. This is part of a larger system, consisting of: an offline cache prefetches recommender, AutoFDO tooling, and LLVM. In this system, a binary compiled with ``-x86-discriminate-memops`` is run under the observation of the recommender. The recommender identifies certain memory access instructions by their binary file address, and recommends a prefetch of a specific type (NTA, T0, etc) be performed at a specified fixed offset from such an instruction's memory operand. Next, this information needs to be converted to the AutoFDO syntax and the resulting profile may be passed back to the compiler with the LLVM flag ``-prefetch-hints-file``, together with the exact same set of compilation parameters used for the original binary. More information is available in the `RFC `_. -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). +* Windows support for libFuzzer (x86_64). - Special New Feature - ------------------- - - Makes programs 10x faster by doing Special New Thing. - Changes to the LLVM IR ---------------------- * Function attribute ``speculative_load_hardening`` has been introduced to allow indicating that `Speculative Load Hardening `_ must be enabled for the function body. Changes to the AArch64 Target ----------------------------- * Support for Speculative Load Hardening has been added. * Initial support for the Tiny code model, where code and its statically defined symbols must live within 1MB of each other. * Added support for the ``.arch_extension`` assembler directive, just like on ARM. -Changes to the ARM Backend --------------------------- - - During this release ... - - Changes to the Hexagon Target ----------------------------- * Added support for Hexagon/HVX V66 ISA. + Changes to the MIPS Target -------------------------- * Improved support of GlobalISel instruction selection framework. * Implemented emission of ``R_MIPS_JALR`` and ``R_MICROMIPS_JALR`` relocations. These relocations provide hints to a linker for optimization of jumps to protected symbols. * ORC JIT has been supported for MIPS and MIPS64 architectures. * Assembler now suggests alternative MIPS instruction mnemonics when an invalid one is specified. * Improved support for MIPS N32 ABI. * Added new instructions (``pll.ps``, ``plu.ps``, ``cvt.s.pu``, ``cvt.s.pl``, ``cvt.ps``, ``sigrie``). * Numerous bug fixes and code cleanups. 
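As a rough illustration of the ``speculative_load_hardening`` function attribute
mentioned above under *Changes to the LLVM IR*: a front end or pass can request
hardening for a single function through the C++ API. This is only a sketch added
for clarity, not text from these notes; the helper name ``enableSLH`` is invented,
while ``llvm::Attribute::SpeculativeLoadHardening`` and ``Function::addFnAttr``
are assumed to follow the usual ``Attributes.td`` naming convention::

    // Minimal sketch (not part of this release's sources): enable Speculative
    // Load Hardening for one function, equivalent to writing the
    // speculative_load_hardening attribute on it in textual IR.
    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Function.h"

    static void enableSLH(llvm::Function &F) { // helper name is invented
      F.addFnAttr(llvm::Attribute::SpeculativeLoadHardening);
    }
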
+ Changes to the PowerPC Target ----------------------------- * Switched to non-PIC default * Deprecated Darwin support * Enabled Out-of-Order scheduling for P9 * Better overload rules for compatible vector type parameter -* Support constraint ‘wi’, modifier ‘x’ and VSX registers in inline asm +* Support constraint 'wi', modifier 'x' and VSX registers in inline asm * More ``__float128`` support * Added new builtins like vector int128 ``pack``/``unpack`` and ``stxvw4x.be``/``stxvd2x.be`` * Provided significant improvements to the automatic vectorizer * Code-gen improvements (especially for Power9) * Fixed some long-standing bugs in the back end * Added experimental prologue/epilogue improvements * Enabled builtins tests in compiler-rt * Add ``___fixunstfti``/``floattitf`` in compiler-rt to support conversion between IBM double-double and unsigned int128 * Disable randomized address space when running the sanitizers on Linux ppc64le * Completed support in LLD for ELFv2 * Enabled llvm-exegesis latency mode for PPC Changes to the X86 Target ------------------------- * Machine model for AMD bdver2 (Piledriver) CPU was added. It is used to support instruction scheduling and other instruction cost heuristics. * New AVX512F gather and scatter intrinsics were added that take a mask instead of a scalar integer. This removes the need for a bitcast in IR. The new intrinsics are named like the old intrinsics with ``llvm.avx512.`` replaced with ``llvm.avx512.mask.``. The old intrinsics will be removed in a future release. * Added ``cascadelake`` as a CPU name for -march. This is ``skylake-avx512`` with the addition of the ``avx512vnni`` instruction set. * ADCX instruction will no longer be emitted. This instruction is rarely better than the legacy ADC instruction and just increased code size. -Changes to the AMDGPU Target ------------------------------ - During this release ... - -Changes to the AVR Target ------------------------------ - - During this release ... - Changes to the WebAssembly Target --------------------------------- The WebAssembly target is no longer "experimental"! It's now built by default, rather than needing to be enabled with LLVM_EXPERIMENTAL_TARGETS_TO_BUILD. The object file format and core C ABI are now considered stable. That said, the object file format has an ABI versioning capability, and one anticipated use for it will be to add support for returning small structs as multiple return values, once the underlying WebAssembly platform itself supports it. Additionally, multithreading support is not yet included in the stable ABI. + Changes to the Nios2 Target --------------------------- * The Nios2 target was removed from this release. -Changes to the OCaml bindings ------------------------------ - - -Changes to the C API --------------------- - - -Changes to the DAG infrastructure ---------------------------------- - Changes to LLDB =============== + * Printed source code is now syntax highlighted in the terminal (only for C languages). * The expression command now supports tab completing expressions. External Open Source Projects Using LLVM 8 ========================================== LDC - the LLVM-based D compiler ------------------------------- `D `_ is a language with C-like syntax and static typing. It pragmatically combines efficiency, control, and modeling power, with safety and programmer productivity. 
D supports powerful concepts like Compile-Time Function Execution (CTFE) and Template Meta-Programming, provides an innovative approach to concurrency and offers many classical paradigms. `LDC `_ uses the frontend from the reference compiler combined with LLVM as backend to produce efficient native code. LDC targets x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64 are underway. Zig Programming Language ------------------------ `Zig `_ is a system programming language intended to be an alternative to C. It provides high level features such as generics, compile time function execution, and partial evaluation, while exposing low level LLVM IR features such as aliases and intrinsics. Zig uses Clang to provide automatic import of .h symbols, including inline functions and simple macros. Zig uses LLD combined with lazily building compiler-rt to provide out-of-the-box cross-compiling for all supported targets. Additional Information ====================== A wide variety of additional information is available on the `LLVM web page `_, in particular in the `documentation `_ section. The web page also contains versions of the API documentation which is up-to-date with the Subversion version of the source code. You can access versions of these documents specific to this release by going into the ``llvm/docs/`` directory in the LLVM tree. If you have any questions or comments about LLVM, please feel free to contact us via the `mailing lists `_. Index: vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64AsmPrinter.cpp =================================================================== --- vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64AsmPrinter.cpp (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64AsmPrinter.cpp (revision 344765) @@ -1,993 +1,1002 @@ //===- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer ---------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains a printer that converts from our internal representation // of machine-dependent LLVM code to the AArch64 assembly language. 
// //===----------------------------------------------------------------------===// #include "AArch64.h" #include "AArch64MCInstLower.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" #include "InstPrinter/AArch64InstPrinter.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "MCTargetDesc/AArch64TargetStreamer.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "asm-printer" namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), SM(*this) {} StringRef getPassName() const override { return "AArch64 Assembly Printer"; } /// Wrapper for MCInstLowering.lowerOperand() for the /// tblgen'erated pseudo lowering. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { return MCInstLowering.lowerOperand(MO, MCOp); } void EmitJumpTableInfo() override; void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI); void LowerJumpTableDestSmall(MCStreamer &OutStreamer, const MachineInstr &MI); void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); void EmitSled(const MachineInstr &MI, SledKind Kind); /// tblgen'erated driver function for lowering simple MI->MC /// pseudo instructions. bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, const MachineInstr *MI); void EmitInstruction(const MachineInstr *MI) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AsmPrinter::getAnalysisUsage(AU); AU.setPreservesAll(); } bool runOnMachineFunction(MachineFunction &MF) override { AArch64FI = MF.getInfo(); STI = static_cast(&MF.getSubtarget()); SetupMachineFunction(MF); if (STI->isTargetCOFF()) { bool Internal = MF.getFunction().hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? 
COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); OutStreamer->EmitCOFFSymbolStorageClass(Scl); OutStreamer->EmitCOFFSymbolType(Type); OutStreamer->EndCOFFSymbolDef(); } // Emit the rest of the function body. EmitFunctionBody(); // Emit the XRay table for this function. emitXRayTable(); // We didn't modify anything. return false; } private: void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); bool printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); void EmitFunctionBodyEnd() override; MCSymbol *GetCPISymbol(unsigned CPID) const override; void EmitEndOfAsmFile(Module &M) override; AArch64FunctionInfo *AArch64FI = nullptr; /// Emit the LOHs contained in AArch64FI. void EmitLOHs(); /// Emit instruction to set float register to zero. void EmitFMov0(const MachineInstr &MI); using MInstToMCSymbol = std::map; MInstToMCSymbol LOHInstToLabel; }; } // end anonymous namespace void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { EmitSled(MI, SledKind::FUNCTION_ENTER); } void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) { EmitSled(MI, SledKind::FUNCTION_EXIT); } void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) { EmitSled(MI, SledKind::TAIL_CALL); } void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { static const int8_t NoopsInSledCount = 7; // We want to emit the following pattern: // // .Lxray_sled_N: // ALIGN // B #32 // ; 7 NOP instructions (28 bytes) // .tmpN // // We need the 28 bytes (7 instructions) because at runtime, we'd be patching // over the full 32 bytes (8 instructions) with the following pattern: // // STP X0, X30, [SP, #-16]! ; push X0 and the link register to the stack // LDR W0, #12 ; W0 := function ID // LDR X16,#12 ; X16 := addr of __xray_FunctionEntry or __xray_FunctionExit // BLR X16 ; call the tracing trampoline // ;DATA: 32 bits of function ID // ;DATA: lower 32 bits of the address of the trampoline // ;DATA: higher 32 bits of the address of the trampoline // LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack // OutStreamer->EmitCodeAlignment(4); auto CurSled = OutContext.createTempSymbol("xray_sled_", true); OutStreamer->EmitLabel(CurSled); auto Target = OutContext.createTempSymbol(); // Emit "B #32" instruction, which jumps over the next 28 bytes. // The operand has to be the number of 4-byte instructions to jump over, // including the current instruction. 
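// addImm(8): 8 instructions * 4 bytes = 32 bytes, i.e. the B itself plus the
// 7 NOPs emitted below.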
EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::B).addImm(8)); for (int8_t I = 0; I < NoopsInSledCount; I++) EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); OutStreamer->EmitLabel(Target); recordSled(CurSled, MI, Kind); } void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); if (TT.isOSBinFormatMachO()) { // Funny Darwin hack: This flag tells the linker that no global symbols // contain code that falls through to other global symbols (e.g. the obvious // implementation of multiple entry points). If this doesn't occur, the // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); emitStackMaps(SM); } } void AArch64AsmPrinter::EmitLOHs() { SmallVector MCArgs; for (const auto &D : AArch64FI->getLOHContainer()) { for (const MachineInstr *MI : D.getArgs()) { MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); assert(LabelIt != LOHInstToLabel.end() && "Label hasn't been inserted for LOH related instruction"); MCArgs.push_back(LabelIt->second); } OutStreamer->EmitLOHDirective(D.getKind(), MCArgs); MCArgs.clear(); } } void AArch64AsmPrinter::EmitFunctionBodyEnd() { if (!AArch64FI->getLOHRelated().empty()) EmitLOHs(); } /// GetCPISymbol - Return the symbol for the specified constant pool entry. MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { // Darwin uses a linker-private symbol name for constant-pools (to // avoid addends on the relocation?), ELF has no such concept and // uses a normal private symbol. if (!getDataLayout().getLinkerPrivateGlobalPrefix().empty()) return OutContext.getOrCreateSymbol( Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber()) + "_" + Twine(CPID)); return AsmPrinter::GetCPISymbol(CPID); } void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNum); switch (MO.getType()) { default: llvm_unreachable(""); case MachineOperand::MO_Register: { unsigned Reg = MO.getReg(); assert(TargetRegisterInfo::isPhysicalRegister(Reg)); assert(!MO.getSubReg() && "Subregs should be eliminated!"); O << AArch64InstPrinter::getRegisterName(Reg); break; } case MachineOperand::MO_Immediate: { int64_t Imm = MO.getImm(); O << '#' << Imm; break; } case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); MCSymbol *Sym = getSymbol(GV); // FIXME: Can we get anything other than a plain symbol here? assert(!MO.getTargetFlags() && "Unknown operand target flag!"); Sym->print(O, MAI); printOffset(MO.getOffset(), O); break; } case MachineOperand::MO_BlockAddress: { MCSymbol *Sym = GetBlockAddressSymbol(MO.getBlockAddress()); Sym->print(O, MAI); break; } } } bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O) { unsigned Reg = MO.getReg(); switch (Mode) { default: return true; // Unknown mode. case 'w': Reg = getWRegFromXReg(Reg); break; case 'x': Reg = getXRegFromWReg(Reg); break; } O << AArch64InstPrinter::getRegisterName(Reg); return false; } // Prints the register in MO using class RC using the offset in the // new register class. This should not be used for cross class // printing. 
bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); O << AArch64InstPrinter::getRegisterName( RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName); return false; } bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { const MachineOperand &MO = MI->getOperand(OpNum); // First try the generic code, which knows about modifiers like 'c' and 'n'. if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O)) return false; // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { if (ExtraCode[1] != 0) return true; // Unknown modifier. switch (ExtraCode[0]) { default: return true; // Unknown modifier. case 'a': // Print 'a' modifier PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) return printAsmMRegister(MO, ExtraCode[0], O); if (MO.isImm() && MO.getImm() == 0) { unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; O << AArch64InstPrinter::getRegisterName(Reg); return false; } printOperand(MI, OpNum, O); return false; case 'b': // Print B register. case 'h': // Print H register. case 's': // Print S register. case 'd': // Print D register. case 'q': // Print Q register. if (MO.isReg()) { const TargetRegisterClass *RC; switch (ExtraCode[0]) { case 'b': RC = &AArch64::FPR8RegClass; break; case 'h': RC = &AArch64::FPR16RegClass; break; case 's': RC = &AArch64::FPR32RegClass; break; case 'd': RC = &AArch64::FPR64RegClass; break; case 'q': RC = &AArch64::FPR128RegClass; break; default: return true; } return printAsmRegInClass(MO, RC, false /* vector */, O); } printOperand(MI, OpNum, O); return false; } } // According to ARM, we should emit x and v registers unless we have a // modifier. if (MO.isReg()) { unsigned Reg = MO.getReg(); // If this is a w or x register, print an x register. if (AArch64::GPR32allRegClass.contains(Reg) || AArch64::GPR64allRegClass.contains(Reg)) return printAsmMRegister(MO, 'x', O); // If this is a b, h, s, d, or q register, print it as a v register. return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, O); } printOperand(MI, OpNum, O); return false; } bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. const MachineOperand &MO = MI->getOperand(OpNum); assert(MO.isReg() && "unexpected inline asm memory operand"); O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]"; return false; } void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS) { unsigned NOps = MI->getNumOperands(); assert(NOps == 4); OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; // cast away const; DIetc do not take const operands for some reason. OS << cast(MI->getOperand(NOps - 2).getMetadata()) ->getName(); OS << " <- "; // Frame address. Currently handles register +- offset only. 
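// For this DBG_VALUE form, operand 0 is the base register and operand 1 the
// immediate offset; the variable's debug metadata sits at operand NOps - 2.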
assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS); OS << ']'; OS << "+"; printOperand(MI, NOps - 2, OS); } void AArch64AsmPrinter::EmitJumpTableInfo() { const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo(); if (!MJTI) return; const std::vector &JT = MJTI->getJumpTables(); if (JT.empty()) return; + const Function &F = MF->getFunction(); const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(MF->getFunction(), TM); - OutStreamer->SwitchSection(ReadOnlySec); + bool JTInDiffSection = + !STI->isTargetCOFF() || + !TLOF.shouldPutJumpTableInFunctionSection( + MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32, + F); + if (JTInDiffSection) { + // Drop it in the readonly section. + MCSection *ReadOnlySec = TLOF.getSectionForJumpTable(F, TM); + OutStreamer->SwitchSection(ReadOnlySec); + } auto AFI = MF->getInfo(); for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) { const std::vector &JTBBs = JT[JTI].MBBs; // If this jump table was deleted, ignore it. if (JTBBs.empty()) continue; unsigned Size = AFI->getJumpTableEntrySize(JTI); EmitAlignment(Log2_32(Size)); OutStreamer->EmitLabel(GetJTISymbol(JTI)); for (auto *JTBB : JTBBs) emitJumpTableEntry(MJTI, JTBB, JTI); } } void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI) { const MCExpr *Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); auto AFI = MF->getInfo(); unsigned Size = AFI->getJumpTableEntrySize(JTI); if (Size == 4) { // .word LBB - LJTI const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, JTI, OutContext); Value = MCBinaryExpr::createSub(Value, Base, OutContext); } else { // .byte (LBB - LBB) >> 2 (or .hword) const MCSymbol *BaseSym = AFI->getJumpTableEntryPCRelSymbol(JTI); const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, OutContext); Value = MCBinaryExpr::createSub(Value, Base, OutContext); Value = MCBinaryExpr::createLShr( Value, MCConstantExpr::create(2, OutContext), OutContext); } OutStreamer->EmitValue(Value, Size); } /// Small jump tables contain an unsigned byte or half, representing the offset /// from the lowest-addressed possible destination to the desired basic /// block. Since all instructions are 4-byte aligned, this is further compressed /// by counting in instructions rather than bytes (i.e. divided by 4). So, to /// materialize the correct destination we need: /// /// adr xDest, .LBB0_0 /// ldrb wScratch, [xTable, xEntry] (with "lsl #1" for ldrh). /// add xDest, xDest, xScratch, lsl #2 void AArch64AsmPrinter::LowerJumpTableDestSmall(llvm::MCStreamer &OutStreamer, const llvm::MachineInstr &MI) { unsigned DestReg = MI.getOperand(0).getReg(); unsigned ScratchReg = MI.getOperand(1).getReg(); unsigned ScratchRegW = STI->getRegisterInfo()->getSubReg(ScratchReg, AArch64::sub_32); unsigned TableReg = MI.getOperand(2).getReg(); unsigned EntryReg = MI.getOperand(3).getReg(); int JTIdx = MI.getOperand(4).getIndex(); bool IsByteEntry = MI.getOpcode() == AArch64::JumpTableDest8; // This has to be first because the compression pass based its reachability // calculations on the start of the JumpTableDest instruction. 
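// Materialize the base label address in DestReg with ADR; the table entry
// loaded below counts 4-byte instructions and is added back in with LSL #2.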
auto Label = MF->getInfo()->getJumpTableEntryPCRelSymbol(JTIdx); EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADR) .addReg(DestReg) .addExpr(MCSymbolRefExpr::create( Label, MF->getContext()))); // Load the number of instruction-steps to offset from the label. unsigned LdrOpcode = IsByteEntry ? AArch64::LDRBBroX : AArch64::LDRHHroX; EmitToStreamer(OutStreamer, MCInstBuilder(LdrOpcode) .addReg(ScratchRegW) .addReg(TableReg) .addReg(EntryReg) .addImm(0) .addImm(IsByteEntry ? 0 : 1)); // Multiply the steps by 4 and add to the already materialized base label // address. EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::ADDXrs) .addReg(DestReg) .addReg(DestReg) .addReg(ScratchReg) .addImm(2)); } void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes(); SM.recordStackMap(MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); // Scan ahead to trim the shadow. const MachineBasicBlock &MBB = *MI.getParent(); MachineBasicBlock::const_iterator MII(MI); ++MII; while (NumNOPBytes > 0) { if (MII == MBB.end() || MII->isCall() || MII->getOpcode() == AArch64::DBG_VALUE || MII->getOpcode() == TargetOpcode::PATCHPOINT || MII->getOpcode() == TargetOpcode::STACKMAP) break; ++MII; NumNOPBytes -= 4; } // Emit nops. for (unsigned i = 0; i < NumNOPBytes; i += 4) EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } // Lower a patchpoint of the form: // [], , , , void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); int64_t CallTarget = Opers.getCallTarget().getImm(); unsigned EncodedBytes = 0; if (CallTarget) { assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && "High 16 bits of call target should be zero."); unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF) .addImm(32)); EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF) .addImm(16)); EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF) .addImm(0)); EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); } // Emit padding. 
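// The patchpoint reserves NumBytes in total; whatever the materialized call
// above did not use is filled with 4-byte NOPs.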
unsigned NumBytes = Opers.getNumPatchBytes(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); assert((NumBytes - EncodedBytes) % 4 == 0 && "Invalid number of NOP bytes requested!"); for (unsigned i = EncodedBytes; i < NumBytes; i += 4) EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { unsigned DestReg = MI.getOperand(0).getReg(); if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) { // Convert H/S/D register to corresponding Q register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) DestReg = AArch64::Q0 + (DestReg - AArch64::H0); else if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) DestReg = AArch64::Q0 + (DestReg - AArch64::S0); else { assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); DestReg = AArch64::Q0 + (DestReg - AArch64::D0); } MCInst MOVI; MOVI.setOpcode(AArch64::MOVIv2d_ns); MOVI.addOperand(MCOperand::createReg(DestReg)); MOVI.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, MOVI); } else { MCInst FMov; switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); case AArch64::FMOVH0: FMov.setOpcode(AArch64::FMOVWHr); FMov.addOperand(MCOperand::createReg(DestReg)); FMov.addOperand(MCOperand::createReg(AArch64::WZR)); break; case AArch64::FMOVS0: FMov.setOpcode(AArch64::FMOVWSr); FMov.addOperand(MCOperand::createReg(DestReg)); FMov.addOperand(MCOperand::createReg(AArch64::WZR)); break; case AArch64::FMOVD0: FMov.setOpcode(AArch64::FMOVXDr); FMov.addOperand(MCOperand::createReg(DestReg)); FMov.addOperand(MCOperand::createReg(AArch64::XZR)); break; } EmitToStreamer(*OutStreamer, FMov); } } // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Do any auto-generated pseudo lowerings. if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; if (AArch64FI->getLOHRelated().count(MI)) { // Generate a label for LOH related instruction MCSymbol *LOHLabel = createTempSymbol("loh"); // Associate the instruction with the label LOHInstToLabel[MI] = LOHLabel; OutStreamer->EmitLabel(LOHLabel); } AArch64TargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); // Do any manual lowerings. switch (MI->getOpcode()) { default: break; case AArch64::MOVMCSym: { unsigned DestReg = MI->getOperand(0).getReg(); const MachineOperand &MO_Sym = MI->getOperand(1); MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym); MCOperand Hi_MCSym, Lo_MCSym; Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S); Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC); MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym); MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym); MCInst MovZ; MovZ.setOpcode(AArch64::MOVZXi); MovZ.addOperand(MCOperand::createReg(DestReg)); MovZ.addOperand(Hi_MCSym); MovZ.addOperand(MCOperand::createImm(16)); EmitToStreamer(*OutStreamer, MovZ); MCInst MovK; MovK.setOpcode(AArch64::MOVKXi); MovK.addOperand(MCOperand::createReg(DestReg)); MovK.addOperand(MCOperand::createReg(DestReg)); MovK.addOperand(Lo_MCSym); MovK.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, MovK); return; } case AArch64::MOVIv2d_ns: // If the target has , lower this // instruction to movi.16b instead. 
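// Only done when the subtarget needs the zero-cycle-zeroing FP workaround and
// the immediate is zero; otherwise MOVIv2d_ns is lowered by the automated
// path at the end of this function.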
if (STI->hasZeroCycleZeroingFPWorkaround() && MI->getOperand(1).getImm() == 0) { MCInst TmpInst; TmpInst.setOpcode(AArch64::MOVIv16b_ns); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); TmpInst.addOperand(MCOperand::createImm(MI->getOperand(1).getImm())); EmitToStreamer(*OutStreamer, TmpInst); return; } break; case AArch64::DBG_VALUE: { if (isVerbose() && OutStreamer->hasRawTextSupport()) { SmallString<128> TmpStr; raw_svector_ostream OS(TmpStr); PrintDebugValueComment(MI, OS); OutStreamer->EmitRawText(StringRef(OS.str())); } return; case AArch64::EMITBKEY: { ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); if (ExceptionHandlingType != ExceptionHandling::DwarfCFI && ExceptionHandlingType != ExceptionHandling::ARM) return; if (needsCFIMoves() == CFI_M_None) return; OutStreamer->EmitCFIBKeyFrame(); return; } } // Tail calls use pseudo instructions so they have the proper code-gen // attributes (isCall, isReturn, etc.). We lower them to the real // instruction here. case AArch64::TCRETURNri: case AArch64::TCRETURNriBTI: case AArch64::TCRETURNriALL: { MCInst TmpInst; TmpInst.setOpcode(AArch64::BR); TmpInst.addOperand(MCOperand::createReg(MI->getOperand(0).getReg())); EmitToStreamer(*OutStreamer, TmpInst); return; } case AArch64::TCRETURNdi: { MCOperand Dest; MCInstLowering.lowerOperand(MI->getOperand(0), Dest); MCInst TmpInst; TmpInst.setOpcode(AArch64::B); TmpInst.addOperand(Dest); EmitToStreamer(*OutStreamer, TmpInst); return; } case AArch64::TLSDESC_CALLSEQ: { /// lower this to: /// adrp x0, :tlsdesc:var /// ldr x1, [x0, #:tlsdesc_lo12:var] /// add x0, x0, #:tlsdesc_lo12:var /// .tlsdesccall var /// blr x1 /// (TPIDR_EL0 offset now in x0) const MachineOperand &MO_Sym = MI->getOperand(0); MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym); MCOperand Sym, SymTLSDescLo12, SymTLSDesc; MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE); MCInstLowering.lowerOperand(MO_Sym, Sym); MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12); MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc); MCInst Adrp; Adrp.setOpcode(AArch64::ADRP); Adrp.addOperand(MCOperand::createReg(AArch64::X0)); Adrp.addOperand(SymTLSDesc); EmitToStreamer(*OutStreamer, Adrp); MCInst Ldr; Ldr.setOpcode(AArch64::LDRXui); Ldr.addOperand(MCOperand::createReg(AArch64::X1)); Ldr.addOperand(MCOperand::createReg(AArch64::X0)); Ldr.addOperand(SymTLSDescLo12); Ldr.addOperand(MCOperand::createImm(0)); EmitToStreamer(*OutStreamer, Ldr); MCInst Add; Add.setOpcode(AArch64::ADDXri); Add.addOperand(MCOperand::createReg(AArch64::X0)); Add.addOperand(MCOperand::createReg(AArch64::X0)); Add.addOperand(SymTLSDescLo12); Add.addOperand(MCOperand::createImm(AArch64_AM::getShiftValue(0))); EmitToStreamer(*OutStreamer, Add); // Emit a relocation-annotation. This expands to no code, but requests // the following instruction gets an R_AARCH64_TLSDESC_CALL. 
MCInst TLSDescCall; TLSDescCall.setOpcode(AArch64::TLSDESCCALL); TLSDescCall.addOperand(Sym); EmitToStreamer(*OutStreamer, TLSDescCall); MCInst Blr; Blr.setOpcode(AArch64::BLR); Blr.addOperand(MCOperand::createReg(AArch64::X1)); EmitToStreamer(*OutStreamer, Blr); return; } case AArch64::JumpTableDest32: { // We want: // ldrsw xScratch, [xTable, xEntry, lsl #2] // add xDest, xTable, xScratch unsigned DestReg = MI->getOperand(0).getReg(), ScratchReg = MI->getOperand(1).getReg(), TableReg = MI->getOperand(2).getReg(), EntryReg = MI->getOperand(3).getReg(); EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::LDRSWroX) .addReg(ScratchReg) .addReg(TableReg) .addReg(EntryReg) .addImm(0) .addImm(1)); EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::ADDXrs) .addReg(DestReg) .addReg(TableReg) .addReg(ScratchReg) .addImm(0)); return; } case AArch64::JumpTableDest16: case AArch64::JumpTableDest8: LowerJumpTableDestSmall(*OutStreamer, *MI); return; case AArch64::FMOVH0: case AArch64::FMOVS0: case AArch64::FMOVD0: EmitFMov0(*MI); return; case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); case TargetOpcode::PATCHPOINT: return LowerPATCHPOINT(*OutStreamer, SM, *MI); case TargetOpcode::PATCHABLE_FUNCTION_ENTER: LowerPATCHABLE_FUNCTION_ENTER(*MI); return; case TargetOpcode::PATCHABLE_FUNCTION_EXIT: LowerPATCHABLE_FUNCTION_EXIT(*MI); return; case TargetOpcode::PATCHABLE_TAIL_CALL: LowerPATCHABLE_TAIL_CALL(*MI); return; case AArch64::SEH_StackAlloc: TS->EmitARM64WinCFIAllocStack(MI->getOperand(0).getImm()); return; case AArch64::SEH_SaveFPLR: TS->EmitARM64WinCFISaveFPLR(MI->getOperand(0).getImm()); return; case AArch64::SEH_SaveFPLR_X: assert(MI->getOperand(0).getImm() < 0 && "Pre increment SEH opcode must have a negative offset"); TS->EmitARM64WinCFISaveFPLRX(-MI->getOperand(0).getImm()); return; case AArch64::SEH_SaveReg: TS->EmitARM64WinCFISaveReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); return; case AArch64::SEH_SaveReg_X: assert(MI->getOperand(1).getImm() < 0 && "Pre increment SEH opcode must have a negative offset"); TS->EmitARM64WinCFISaveRegX(MI->getOperand(0).getImm(), -MI->getOperand(1).getImm()); return; case AArch64::SEH_SaveRegP: assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) && "Non-consecutive registers not allowed for save_regp"); TS->EmitARM64WinCFISaveRegP(MI->getOperand(0).getImm(), MI->getOperand(2).getImm()); return; case AArch64::SEH_SaveRegP_X: assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) && "Non-consecutive registers not allowed for save_regp_x"); assert(MI->getOperand(2).getImm() < 0 && "Pre increment SEH opcode must have a negative offset"); TS->EmitARM64WinCFISaveRegPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; case AArch64::SEH_SaveFReg: TS->EmitARM64WinCFISaveFReg(MI->getOperand(0).getImm(), MI->getOperand(1).getImm()); return; case AArch64::SEH_SaveFReg_X: assert(MI->getOperand(1).getImm() < 0 && "Pre increment SEH opcode must have a negative offset"); TS->EmitARM64WinCFISaveFRegX(MI->getOperand(0).getImm(), -MI->getOperand(1).getImm()); return; case AArch64::SEH_SaveFRegP: assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) && "Non-consecutive registers not allowed for save_regp"); TS->EmitARM64WinCFISaveFRegP(MI->getOperand(0).getImm(), MI->getOperand(2).getImm()); return; case AArch64::SEH_SaveFRegP_X: assert((MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1) && "Non-consecutive registers not allowed for save_regp_x"); 
assert(MI->getOperand(2).getImm() < 0 && "Pre increment SEH opcode must have a negative offset"); TS->EmitARM64WinCFISaveFRegPX(MI->getOperand(0).getImm(), -MI->getOperand(2).getImm()); return; case AArch64::SEH_SetFP: TS->EmitARM64WinCFISetFP(); return; case AArch64::SEH_AddFP: TS->EmitARM64WinCFIAddFP(MI->getOperand(0).getImm()); return; case AArch64::SEH_Nop: TS->EmitARM64WinCFINop(); return; case AArch64::SEH_PrologEnd: TS->EmitARM64WinCFIPrologEnd(); return; case AArch64::SEH_EpilogStart: TS->EmitARM64WinCFIEpilogStart(); return; case AArch64::SEH_EpilogEnd: TS->EmitARM64WinCFIEpilogEnd(); return; } // Finally, do the automated lowerings for everything else. MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); } // Force static initialization. extern "C" void LLVMInitializeAArch64AsmPrinter() { RegisterAsmPrinter X(getTheAArch64leTarget()); RegisterAsmPrinter Y(getTheAArch64beTarget()); RegisterAsmPrinter Z(getTheARM64Target()); } Index: vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64FrameLowering.cpp (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64FrameLowering.cpp (revision 344765) @@ -1,2160 +1,2159 @@ //===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the AArch64 implementation of TargetFrameLowering class. // // On AArch64, stack frames are structured as follows: // // The stack grows downward. // // All of the individual frame areas on the frame below are optional, i.e. it's // possible to create a function so that the particular area isn't present // in the frame. // // At function entry, the "frame" looks as follows: // // | | Higher address // |-----------------------------------| // | | // | arguments passed on the stack | // | | // |-----------------------------------| <- sp // | | Lower address // // // After the prologue has run, the frame has the following general structure. // Note that this doesn't depict the case where a red-zone is used. Also, // technically the last frame area (VLAs) doesn't get created until in the // main function body, after the prologue is run. However, it's depicted here // for completeness. // // | | Higher address // |-----------------------------------| // | | // | arguments passed on the stack | // | | // |-----------------------------------| // | | // | (Win64 only) varargs from reg | // | | // |-----------------------------------| // | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) // | | // | other callee-saved registers | // | | // |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) // |-----------------------------------| // | | // | local variables of fixed size | // | including spill slots | // |-----------------------------------| <- bp(not defined by ABI, // |.variable-sized.local.variables....| LLVM chooses X19) // |.(VLAs)............................| (size of this area is unknown at // |...................................| compile time) // |-----------------------------------| <- sp // | | Lower address // // // To access the data in a frame, at-compile time, a constant offset must be // computable from one of the pointers (fp, bp, sp) to access it. The size // of the areas with a dotted background cannot be computed at compile-time // if they are present, making it required to have all three of fp, bp and // sp to be set up to be able to access all contents in the frame areas, // assuming all of the frame areas are non-empty. // // For most functions, some of the frame areas are empty. For those functions, // it may not be necessary to set up fp or bp: // * A base pointer is definitely needed when there are both VLAs and local // variables with more-than-default alignment requirements. // * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // // In some cases when a base pointer is not strictly needed, it is generated // anyway when offsets from the frame pointer to access local variables become // so large that the offset can't be encoded in the immediate fields of loads // or stores. // // FIXME: also explain the redzone concept. // FIXME: also explain the concept of reserved call frames. 
// //===----------------------------------------------------------------------===// #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "frame-info" static cl::opt EnableRedZone("aarch64-redzone", cl::desc("enable use of redzone on AArch64"), cl::init(false), cl::Hidden); static cl::opt ReverseCSRRestoreSeq("reverse-csr-restore-seq", cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// This is the biggest offset to the stack pointer we can encode in aarch64 /// instructions (without using a separate calculation and a temp register). /// Note that the exception here are vector stores/loads which cannot encode any /// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()). static const unsigned DefaultSafeSPDisplacement = 255; /// Look at each instruction that references stack frames and return the stack /// size limit beyond which some of these instructions will require a scratch /// register during their expansion later. static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { // FIXME: For now, just conservatively guestimate based on unscaled indexing // range. We'll end up allocating an unnecessary spill slot a lot, but // realistically that's not a big deal at this stage of the game. for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MI.isDebugInstr() || MI.isPseudo() || MI.getOpcode() == AArch64::ADDXri || MI.getOpcode() == AArch64::ADDSXri) continue; for (const MachineOperand &MO : MI.operands()) { if (!MO.isFI()) continue; int Offset = 0; if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == AArch64FrameOffsetCannotUpdate) return 0; } } } return DefaultSafeSPDisplacement; } bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; // Don't use the red zone if the function explicitly asks us not to. // This is typically used for kernel code. 
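// Illustrative sketch (not part of the upstream file): the "unscaled indexing
// range" the estimate above is based on is the signed 9-bit byte offset of
// the unscaled load/store forms (LDUR/STUR), i.e. [-256, +255], which matches
// the DefaultSafeSPDisplacement value of 255.
static bool fitsUnscaledImm9(int SPByteOffset) {
  return SPByteOffset >= -256 && SPByteOffset <= 255;
}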
if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) return false; const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); // Win64 EH requires a frame pointer if funclets are present, as the locals // are accessed off the frame pointer in both the parent function and the // funclets. if (MF.hasEHFunclets()) return true; // Retain behavior of always omitting the FP for leaf functions when possible. if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF)) return true; if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasStackMap() || MFI.hasPatchPoint() || RegInfo->needsStackRealignment(MF)) return true; // With large callframes around we may need to use FP to access the scavenging // emergency spillslot. // // Unfortunately some calls to hasFP() like machine verifier -> // getReservedReg() -> hasFP in the middle of global isel are too early // to know the max call frame size. Hopefully conservatively returning "true" // in those cases is fine. // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs. if (!MFI.isMaxCallFrameSizeComputed() || MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement) return true; // Win64 SEH requires frame pointer if funclets are present. if (MF.hasLocalEscape()) return true; return false; } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is /// not required, we reserve argument space for call sites in the function /// immediately on entry to the current function. This eliminates the need for /// add/sub sp brackets around call sites. Returns true if the call frame is /// included as part of the stack frame. bool AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo().hasVarSizedObjects(); } MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = static_cast(MF.getSubtarget().getInstrInfo()); DebugLoc DL = I->getDebugLoc(); unsigned Opc = I->getOpcode(); bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); if (!TFI->hasReservedCallFrame(MF)) { unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it // doesn't have to pop anything), then the first operand will be zero too so // this adjustment is a no-op. if (CalleePopAmount == 0) { // FIXME: in-function stack adjustment for calls is limited to 24-bits // because there's no guaranteed temporary register available. // // ADD/SUB (immediate) has only LSL #0 and LSL #12 available. // 1) For offset <= 12-bit, we use LSL #0 // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. 
// // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. assert(CalleePopAmount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } return MBB.erase(I); } static bool ShouldSignReturnAddress(MachineFunction &MF) { // The function should be signed in the following situations: // - sign-return-address=all // - sign-return-address=non-leaf and the functions spills the LR const Function &F = MF.getFunction(); if (!F.hasFnAttribute("sign-return-address")) return false; StringRef Scope = F.getFnAttribute("sign-return-address").getValueAsString(); if (Scope.equals("none")) return false; if (Scope.equals("all")) return true; assert(Scope.equals("non-leaf") && "Expected all, none or non-leaf"); for (const auto &Info : MF.getFrameInfo().getCalleeSavedInfo()) if (Info.getReg() == AArch64::LR) return true; return false; } void AArch64FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetSubtargetInfo &STI = MF.getSubtarget(); const MCRegisterInfo *MRI = STI.getRegisterInfo(); const TargetInstrInfo *TII = STI.getInstrInfo(); DebugLoc DL = MBB.findDebugLoc(MBBI); // Add callee saved registers to move list. const std::vector &CSI = MFI.getCalleeSavedInfo(); if (CSI.empty()) return; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); int64_t Offset = MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } // Find a scratch register that we can use at the start of the prologue to // re-align the stack pointer. We avoid using callee-save registers since they // may appear to be free when this is called from canUseAsPrologue (during // shrink wrapping), but then no longer be free when this is called from // emitPrologue. // // FIXME: This is a bit conservative, since in the above case we could use one // of the callee-save registers as a scratch temp to re-align the stack pointer, // but we would then have to make sure that we were in fact saving at least one // callee-save register in the prologue, which is additional complexity that // doesn't seem worth the benefit. static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { MachineFunction *MF = MBB->getParent(); // If MBB is an entry block, use X9 as the scratch register if (&MF->front() == MBB) return AArch64::X9; const AArch64Subtarget &Subtarget = MF->getSubtarget(); const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); LiveRegs.addLiveIns(*MBB); // Mark callee saved registers as used so we will not choose them. 
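// Illustrative sketch (not part of the upstream file): the two-instruction
// scheme for call-frame adjustments of up to 24 bits described a little
// earlier.  ADD/SUB (immediate) takes a 12-bit value with either LSL #0 or
// LSL #12, so e.g. an adjustment of 0x123456 is applied as
//   sub sp, sp, #0x123, lsl #12
//   sub sp, sp, #0x456
static void splitFrameOffset24(unsigned Amount /* magnitude, < 1 << 24 */,
                               unsigned &Hi12, unsigned &Lo12) {
  Hi12 = (Amount >> 12) & 0xfffu; // used with LSL #12 (skipped if zero)
  Lo12 = Amount & 0xfffu;         // used with LSL #0  (skipped if zero)
  // Amount == (Hi12 << 12) + Lo12.
}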
const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs(); for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); // Prefer X9 since it was historically used for the prologue scratch reg. const MachineRegisterInfo &MRI = MF->getRegInfo(); if (LiveRegs.available(MRI, AArch64::X9)) return AArch64::X9; for (unsigned Reg : AArch64::GPR64RegClass) { if (LiveRegs.available(MRI, Reg)) return Reg; } return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); MachineBasicBlock *TmpMBB = const_cast(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. if (!RegInfo->needsStackRealignment(*MF)) return true; // Otherwise, we can use any block as long as it has a scratch register // available. return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; } static bool windowsRequiresStackProbe(MachineFunction &MF, unsigned StackSizeInBytes) { const AArch64Subtarget &Subtarget = MF.getSubtarget(); if (!Subtarget.isTargetWindows()) return false; const Function &F = MF.getFunction(); // TODO: When implementing stack protectors, take that into account // for the probe threshold. unsigned StackProbeSize = 4096; if (F.hasFnAttribute("stack-probe-size")) F.getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); return (StackSizeInBytes >= StackProbeSize) && !F.hasFnAttribute("no-stack-arg-probe"); } bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( MachineFunction &MF, unsigned StackBumpBytes) const { AArch64FunctionInfo *AFI = MF.getInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (AFI->getLocalStackSize() == 0) return false; // 512 is the maximum immediate for stp/ldp that will be used for // callee-save save/restores if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes)) return false; if (MFI.hasVarSizedObjects()) return false; if (RegInfo->needsStackRealignment(MF)) return false; // This isn't strictly necessary, but it simplifies things a bit since the // current RedZone handling code assumes the SP is adjusted by the // callee-save save/restore code. if (canUseRedZone(MF)) return false; return true; } // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. 
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, const TargetInstrInfo &TII, MachineInstr::MIFlag Flag) { unsigned Opc = MBBI->getOpcode(); MachineBasicBlock *MBB = MBBI->getParent(); MachineFunction &MF = *MBB->getParent(); DebugLoc DL = MBBI->getDebugLoc(); unsigned ImmIdx = MBBI->getNumOperands() - 1; int Imm = MBBI->getOperand(ImmIdx).getImm(); MachineInstrBuilder MIB; const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); switch (Opc) { default: llvm_unreachable("No SEH Opcode for this instruction"); case AArch64::LDPDpost: Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPDpre: { unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X)) .addImm(Reg0) .addImm(Reg1) .addImm(Imm * 8) .setMIFlag(Flag); break; } case AArch64::LDPXpost: Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPXpre: { unsigned Reg0 = MBBI->getOperand(1).getReg(); unsigned Reg1 = MBBI->getOperand(2).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) .addImm(Imm * 8) .setMIFlag(Flag); else MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X)) .addImm(RegInfo->getSEHRegNum(Reg0)) .addImm(RegInfo->getSEHRegNum(Reg1)) .addImm(Imm * 8) .setMIFlag(Flag); break; } case AArch64::LDRDpost: Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STRDpre: { unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X)) .addImm(Reg) .addImm(Imm) .setMIFlag(Flag); break; } case AArch64::LDRXpost: Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STRXpre: { unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X)) .addImm(Reg) .addImm(Imm) .setMIFlag(Flag); break; } case AArch64::STPDi: case AArch64::LDPDi: { unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP)) .addImm(Reg0) .addImm(Reg1) .addImm(Imm * 8) .setMIFlag(Flag); break; } case AArch64::STPXi: case AArch64::LDPXi: { unsigned Reg0 = MBBI->getOperand(0).getReg(); unsigned Reg1 = MBBI->getOperand(1).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) .setMIFlag(Flag); else MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP)) .addImm(RegInfo->getSEHRegNum(Reg0)) .addImm(RegInfo->getSEHRegNum(Reg1)) .addImm(Imm * 8) .setMIFlag(Flag); break; } case AArch64::STRXui: case AArch64::LDRXui: { int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg)) .addImm(Reg) .addImm(Imm * 8) .setMIFlag(Flag); break; } case AArch64::STRDui: case AArch64::LDRDui: { unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg)) .addImm(Reg) .addImm(Imm * 8) .setMIFlag(Flag); break; } } auto I = MBB->insertAfter(MBBI, MIB); return I; } // Fix up the SEH opcode associated with the save/restore instruction. 
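// Illustrative sketch (not part of the upstream file): how InsertSEH() above
// turns a save/restore immediate into the value carried by the SEH opcode.
// STP/LDP and the scaled STRXui/STRDui/LDRXui/LDRDui forms encode their
// offset in 8-byte units, while the pre/post-indexed single-register forms
// already use bytes; post-indexed (epilogue) forms are negated first so the
// SEH code mirrors the matching prologue adjustment.  The negative value of
// the *_X codes is negated once more by the AsmPrinter SEH cases earlier in
// this change.
static int sehMIImmediate(int MIImm, bool ScaledBy8, bool PostIndexed) {
  if (PostIndexed)
    MIImm = -MIImm; // epilogue restores describe the same area as the saves
  return ScaledBy8 ? MIImm * 8 : MIImm;
}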
static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, unsigned LocalStackSize) { MachineOperand *ImmOpnd = nullptr; unsigned ImmIdx = MBBI->getNumOperands() - 1; switch (MBBI->getOpcode()) { default: llvm_unreachable("Fix the offset in the SEH instruction"); case AArch64::SEH_SaveFPLR: case AArch64::SEH_SaveRegP: case AArch64::SEH_SaveReg: case AArch64::SEH_SaveFRegP: case AArch64::SEH_SaveFReg: ImmOpnd = &MBBI->getOperand(ImmIdx); break; } if (ImmOpnd) ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize); } // Convert callee-save register save/restore instruction to do stack pointer // decrement/increment to allocate/deallocate the callee-save stack area by // converting store/load to use pre/post increment version. static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc, bool NeedsWinCFI, bool InProlog = true) { // Ignore instructions that do not operate on SP, i.e. shadow call stack // instructions and associated CFI instruction. while (MBBI->getOpcode() == AArch64::STRXpost || MBBI->getOpcode() == AArch64::LDRXpre || MBBI->getOpcode() == AArch64::CFI_INSTRUCTION) { if (MBBI->getOpcode() != AArch64::CFI_INSTRUCTION) assert(MBBI->getOperand(0).getReg() != AArch64::SP); ++MBBI; } unsigned NewOpc; int Scale = 1; switch (MBBI->getOpcode()) { default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); case AArch64::STPXi: NewOpc = AArch64::STPXpre; Scale = 8; break; case AArch64::STPDi: NewOpc = AArch64::STPDpre; Scale = 8; break; case AArch64::STPQi: NewOpc = AArch64::STPQpre; Scale = 16; break; case AArch64::STRXui: NewOpc = AArch64::STRXpre; break; case AArch64::STRDui: NewOpc = AArch64::STRDpre; break; case AArch64::STRQui: NewOpc = AArch64::STRQpre; break; case AArch64::LDPXi: NewOpc = AArch64::LDPXpost; Scale = 8; break; case AArch64::LDPDi: NewOpc = AArch64::LDPDpost; Scale = 8; break; case AArch64::LDPQi: NewOpc = AArch64::LDPQpost; Scale = 16; break; case AArch64::LDRXui: NewOpc = AArch64::LDRXpost; break; case AArch64::LDRDui: NewOpc = AArch64::LDRDpost; break; case AArch64::LDRQui: NewOpc = AArch64::LDRQpost; break; } // Get rid of the SEH code associated with the old instruction. if (NeedsWinCFI) { auto SEH = std::next(MBBI); if (AArch64InstrInfo::isSEHInstruction(*SEH)) SEH->eraseFromParent(); } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); MIB.addReg(AArch64::SP, RegState::Define); // Copy all operands other than the immediate offset. unsigned OpndIdx = 0; for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; ++OpndIdx) MIB.add(MBBI->getOperand(OpndIdx)); assert(MBBI->getOperand(OpndIdx).getImm() == 0 && "Unexpected immediate offset in first/last callee-save save/restore " "instruction!"); assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && "Unexpected base register in callee-save save/restore instruction!"); assert(CSStackSizeInc % Scale == 0); MIB.addImm(CSStackSizeInc / Scale); MIB.setMIFlags(MBBI->getFlags()); MIB.setMemRefs(MBBI->memoperands()); // Generate a new SEH code that corresponds to the new instruction. if (NeedsWinCFI) InsertSEH(*MIB, *TII, InProlog ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy); return std::prev(MBB.erase(MBBI)); } // Fixup callee-save register save/restore instructions to take into account // combined SP bump by adding the local stack size to the stack offsets. 
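// Illustrative sketch (not part of the upstream file): the immediate rewrite
// performed by convertCalleeSaveRestoreToSPPrePostIncDec() above.  The SP
// adjustment is given in bytes, but the pre/post-indexed forms encode it in
// units of their scale (8 for X/D pairs, 16 for Q pairs, 1 for the unscaled
// single-register forms), so allocating a 32-byte callee-save area turns
//   stp x29, x30, [sp, #0]      (STPXi)
// into
//   stp x29, x30, [sp, #-32]!   (STPXpre, MI immediate -32 / 8 = -4).
static int scaledPrePostImm(int CSStackSizeIncBytes, int Scale) {
  // The byte adjustment is asserted above to be a multiple of Scale.
  return CSStackSizeIncBytes / Scale;
}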
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, unsigned LocalStackSize, bool NeedsWinCFI) { if (AArch64InstrInfo::isSEHInstruction(MI)) return; unsigned Opc = MI.getOpcode(); // Ignore instructions that do not operate on SP, i.e. shadow call stack // instructions and associated CFI instruction. if (Opc == AArch64::STRXpost || Opc == AArch64::LDRXpre || Opc == AArch64::CFI_INSTRUCTION) { if (Opc != AArch64::CFI_INSTRUCTION) assert(MI.getOperand(0).getReg() != AArch64::SP); return; } unsigned Scale; switch (Opc) { case AArch64::STPXi: case AArch64::STRXui: case AArch64::STPDi: case AArch64::STRDui: case AArch64::LDPXi: case AArch64::LDRXui: case AArch64::LDPDi: case AArch64::LDRDui: Scale = 8; break; case AArch64::STPQi: case AArch64::STRQui: case AArch64::LDPQi: case AArch64::LDRQui: Scale = 16; break; default: llvm_unreachable("Unexpected callee-save save/restore opcode!"); } unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && "Unexpected base register in callee-save save/restore instruction!"); // Last operand is immediate offset that needs fixing. MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); // All generated opcodes have scaled offsets. assert(LocalStackSize % Scale == 0); OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale); if (NeedsWinCFI) { auto MBBI = std::next(MachineBasicBlock::iterator(MI)); assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction"); assert(AArch64InstrInfo::isSEHInstruction(*MBBI) && "Expecting a SEH instruction"); fixupSEHOpcode(MBBI, LocalStackSize); } } static void adaptForLdStOpt(MachineBasicBlock &MBB, MachineBasicBlock::iterator FirstSPPopI, MachineBasicBlock::iterator LastPopI) { // Sometimes (when we restore in the same order as we save), we can end up // with code like this: // // ldp x26, x25, [sp] // ldp x24, x23, [sp, #16] // ldp x22, x21, [sp, #32] // ldp x20, x19, [sp, #48] // add sp, sp, #64 // // In this case, it is always better to put the first ldp at the end, so // that the load-store optimizer can run and merge the ldp and the add into // a post-index ldp. // If we managed to grab the first pop instruction, move it to the end. 
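// Illustrative sketch (not part of the upstream file): the offset shift
// applied by fixupCalleeSaveRestoreStackOffset() above when the callee-save
// and local-stack SP bumps are combined.  Every callee-save slot moves up by
// the local stack size; e.g. with 48 bytes of locals and scale 8,
// stp x20, x19, [sp, #16] (imm 2) becomes stp x20, x19, [sp, #64]
// (imm 2 + 48/8 = 8).
static int combinedBumpImm(int OldScaledImm, unsigned LocalStackSize,
                           unsigned Scale) {
  return OldScaledImm + (int)(LocalStackSize / Scale);
}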
if (ReverseCSRRestoreSeq) MBB.splice(FirstSPPopI, &MBB, LastPopI); // We should end up with something like this now: // // ldp x24, x23, [sp, #16] // ldp x22, x21, [sp, #32] // ldp x20, x19, [sp, #48] // ldp x26, x25, [sp] // add sp, sp, #64 // // and the load-store optimizer can merge the last two instructions into: // // ldp x26, x25, [sp], #64 // } static bool ShouldSignWithAKey(MachineFunction &MF) { const Function &F = MF.getFunction(); if (!F.hasFnAttribute("sign-return-address-key")) return true; const StringRef Key = F.getFnAttribute("sign-return-address-key").getValueAsString(); assert(Key.equals_lower("a_key") || Key.equals_lower("b_key")); return Key.equals_lower("a_key"); } static bool needsWinCFI(const MachineFunction &MF) { const Function &F = MF.getFunction(); return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() && F.needsUnwindTableEntry(); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const Function &F = MF.getFunction(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo(); bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool HasFP = hasFP(MF); bool NeedsWinCFI = needsWinCFI(MF); MF.setHasWinCFI(NeedsWinCFI); bool IsFunclet = MBB.isEHFuncletEntry(); // At this point, we're going to decide whether or not the function uses a // redzone. In most cases, the function doesn't have a redzone so let's // assume that's false and set it to true in the case that there's a redzone. AFI->setHasRedZone(false); // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc DL; if (ShouldSignReturnAddress(MF)) { if (ShouldSignWithAKey(MF)) BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP)) .setMIFlag(MachineInstr::FrameSetup); else { BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP)) .setMIFlag(MachineInstr::FrameSetup); } unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame // pointer from the funclet. We only save the callee saved registers in the // funclet, which are really the callee saved registers of the parent // function, including the funclet. int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); if (!NumBytes) return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. 
if (canUseRedZone(MF)) { AFI->setHasRedZone(true); ++NumRedZoneFunctions; } else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup, false, NeedsWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); return; } bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); // Var args are accounted for in the containing function, so don't // include them for funclets. unsigned FixedObject = (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // All of the remaining stack allocations are for locals. AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup, false, NeedsWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI); NumBytes -= PrologueSaveSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); // Move past the saves of the callee-saved registers, fixing up the offsets // and pre-inc if we decided to combine the callee-save and local stack // pointer bump above. MachineBasicBlock::iterator End = MBB.end(); while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI); ++MBBI; } // The code below is not applicable to funclets. We have emitted all the SEH // opcodes that we needed to emit. The FP and BP belong to the containing // function. if (IsFunclet) { if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); return; } if (HasFP) { // Only set up FP if we actually need to. Frame pointer is fp = // sp - fixedobject - 16. int FPOffset = AFI->getCalleeSavedStackSize() - 16; if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, MachineInstr::FrameSetup, false, NeedsWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { uint32_t NumWords = NumBytes >> 4; if (NeedsWinCFI) { // alloc_l can hold at most 256MB, so assume that NumBytes doesn't // exceed this amount. We need to move at most 2^24 - 1 into x15. // This is at most two instructions, MOVZ follwed by MOVK. // TODO: Fix to use multiple stack alloc unwind codes for stacks // exceeding 256MB in size. 
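// Illustrative sketch (not part of the upstream file): where the frame
// pointer set up above (emitFrameOffset of FP from SP by FPOffset) lands.
// The frame record (previous fp and lr) occupies the top 16 bytes of the
// callee-save area, so
//   fp = sp + CalleeSavedStackSize - 16
// plus the local stack size when the two SP bumps were combined (sp then also
// sits below the locals).  E.g. a 32-byte callee-save area gives fp = sp + 16.
static int frameRecordOffsetFromSP(unsigned CalleeSavedStackSize,
                                   unsigned LocalStackSize,
                                   bool CombinedSPBump) {
  int FPOffset = (int)CalleeSavedStackSize - 16;
  if (CombinedSPBump)
    FPOffset += (int)LocalStackSize;
  return FPOffset;
}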
if (NumBytes >= (1 << 28)) report_fatal_error("Stack size cannot exceed 256MB for stack " "unwinding purposes"); uint32_t LowNumWords = NumWords & 0xFFFF; BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15) .addImm(LowNumWords) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); if ((NumWords & 0xFFFF0000) != 0) { BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15) .addReg(AArch64::X15) .addImm((NumWords & 0xFFFF0000) >> 16) // High half .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 16)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); } } else { BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15) .addImm(NumWords) .setMIFlags(MachineInstr::FrameSetup); } switch (MF.getTarget().getCodeModel()) { case CodeModel::Tiny: case CodeModel::Small: case CodeModel::Medium: case CodeModel::Kernel: BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) .addExternalSymbol("__chkstk") .addReg(AArch64::X15, RegState::Implicit) .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) .setMIFlags(MachineInstr::FrameSetup); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); break; case CodeModel::Large: BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT)) .addReg(AArch64::X16, RegState::Define) .addExternalSymbol("__chkstk") .addExternalSymbol("__chkstk") .setMIFlags(MachineInstr::FrameSetup); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) .addReg(AArch64::X16, RegState::Kill) .addReg(AArch64::X15, RegState::Implicit | RegState::Define) .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::X17, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define | RegState::Dead) .setMIFlags(MachineInstr::FrameSetup); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); break; } BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP) .addReg(AArch64::SP, RegState::Kill) .addReg(AArch64::X15, RegState::Kill) .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 4)) .setMIFlags(MachineInstr::FrameSetup); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); NumBytes = 0; } // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); unsigned scratchSPReg = AArch64::SP; if (NeedsRealignment) { scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); assert(scratchSPReg != AArch64::NoRegister); } // If we're a leaf function, try using the red zone. if (!canUseRedZone(MF)) // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. 
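// Illustrative sketch (not part of the upstream file): the value built in x15
// for the __chkstk call above.  It is the allocation size in 16-byte units
// (the final "sub sp, sp, x15, uxtx #4" scales it back to bytes), materialised
// as a MOVZ of the low 16 bits plus, if needed, a MOVK of the next 16 bits;
// the (1 << 28) check reflects the 256 MB limit of a single alloc_l unwind
// code noted in the comment before it.
static unsigned chkstkWords(unsigned NumBytes, unsigned &MovzImm,
                            unsigned &MovkImm) {
  unsigned NumWords = NumBytes >> 4;    // 16-byte units
  MovzImm = NumWords & 0xffffu;         // movz x15, #MovzImm
  MovkImm = (NumWords >> 16) & 0xffffu; // movk x15, #MovkImm, lsl #16 (if non-zero)
  return NumWords;
}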
emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup, false, NeedsWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); const unsigned NrBitsToZero = countTrailingZeros(Alignment); assert(NrBitsToZero > 1); assert(scratchSPReg != AArch64::SP); // SUB X9, SP, NumBytes // -- X9 is temporary register, so shouldn't contain any live data here, // -- free to use. This is already produced by emitFrameOffset above. // AND SP, X9, 0b11111...0000 // The logical immediates have a non-trivial encoding. The following // formula computes the encoded immediate with all ones but // NrBitsToZero zero bits as least significant bits. uint32_t andMaskEncoded = (1 << 12) // = N | ((64 - NrBitsToZero) << 6) // immr | ((64 - NrBitsToZero - 1) << 0); // imms BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) .addReg(scratchSPReg, RegState::Kill) .addImm(andMaskEncoded); AFI->setStackRealigned(true); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) .addImm(NumBytes & andMaskEncoded) .setMIFlag(MachineInstr::FrameSetup); } } // If we need a base pointer, set it up here. It's whatever the value of the // stack pointer is at this point. Any variable size objects will be allocated // after this, so we can still use the base pointer to reference locals. // // FIXME: Clarify FrameSetup flags here. // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is // needed. if (RegInfo->hasBasePointer(MF)) { TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP, false); if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); } // The very last FrameSetup instruction indicates the end of prologue. Emit a // SEH opcode indicating the prologue end. if (NeedsWinCFI) BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) .setMIFlag(MachineInstr::FrameSetup); if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); const int StackGrowth = -TD.getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo // .align 2 // __foo: // Ltmp0: // .cfi_startproc // .cfi_personality 155, ___gxx_personality_v0 // Leh_func_begin: // .cfi_lsda 16, Lexception33 // // stp xa,bx, [sp, -#offset]! // ... // stp x28, x27, [sp, #offset-32] // stp fp, lr, [sp, #offset-16] // add fp, sp, #offset - 16 // sub sp, sp, #1360 // // The Stack: // +-------------------------------------------+ // 10000 | ........ | ........ | ........ | ........ | // 10004 | ........ | ........ | ........ | ........ | // +-------------------------------------------+ // 10008 | ........ | ........ | ........ | ........ | // 1000c | ........ | ........ | ........ | ........ | // +===========================================+ // 10010 | X28 Register | // 10014 | X28 Register | // +-------------------------------------------+ // 10018 | X27 Register | // 1001c | X27 Register | // +===========================================+ // 10020 | Frame Pointer | // 10024 | Frame Pointer | // +-------------------------------------------+ // 10028 | Link Register | // 1002c | Link Register | // +===========================================+ // 10030 | ........ | ........ | ........ | ........ | // 10034 | ........ | ........ | ........ | ........ | // +-------------------------------------------+ // 10038 | ........ | ........ | ........ | ........ | // 1003c | ........ | ........ | ........ | ........ 
| // +-------------------------------------------+ // // [sp] = 10030 :: >>initial value<< // sp = 10020 :: stp fp, lr, [sp, #-16]! // fp = sp == 10020 :: mov fp, sp // [sp] == 10020 :: stp x28, x27, [sp, #-16]! // sp == 10010 :: >>final value<< // // The frame pointer (w29) points to address 10020. If we use an offset of // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 // for w27, and -32 for w28: // // Ltmp1: // .cfi_def_cfa w29, 16 // Ltmp2: // .cfi_offset w30, -8 // Ltmp3: // .cfi_offset w29, -16 // Ltmp4: // .cfi_offset w27, -24 // Ltmp5: // .cfi_offset w28, -32 if (HasFP) { // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( nullptr, Reg, 2 * StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize())); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } // Now emit the moves for whatever callee saved regs we have (including FP, // LR if those are saved). emitCalleeSavedFrameMoves(MBB, MBBI); } } static void InsertReturnAddressAuth(MachineFunction &MF, MachineBasicBlock &MBB) { if (!ShouldSignReturnAddress(MF)) return; const AArch64Subtarget &Subtarget = MF.getSubtarget(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // The AUTIASP instruction assembles to a hint instruction before v8.3a so // this instruction can safely used for any v8a architecture. // From v8.3a onwards there are optimised authenticate LR and return // instructions, namely RETA{A,B}, that can be used instead. if (Subtarget.hasV8_3aOps() && MBBI != MBB.end() && MBBI->getOpcode() == AArch64::RET_ReallyLR) { BuildMI(MBB, MBBI, DL, TII->get(ShouldSignWithAKey(MF) ? AArch64::RETAA : AArch64::RETAB)) .copyImplicitOps(*MBBI); MBB.erase(MBBI); } else { BuildMI( MBB, MBBI, DL, TII->get(ShouldSignWithAKey(MF) ? AArch64::AUTIASP : AArch64::AUTIBSP)) .setMIFlag(MachineInstr::FrameDestroy); } } static bool isFuncletReturnInstr(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return false; case AArch64::CATCHRET: case AArch64::CLEANUPRET: return true; } } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; bool NeedsWinCFI = needsWinCFI(MF); bool IsFunclet = false; if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); unsigned RetOpcode = MBBI->getOpcode(); IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri || RetOpcode == AArch64::TCRETURNriBTI; IsFunclet = isFuncletReturnInstr(*MBBI); } int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) : MFI.getStackSize(); AArch64FunctionInfo *AFI = MF.getInfo(); // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. 
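// Illustrative sketch (not part of the upstream file): the logical-immediate
// encoding built for the realignment "AND SP, X9, 0b11111...0000" a little
// earlier.  N:immr:imms describe a run of (64 - NrBitsToZero) ones rotated so
// that the NrBitsToZero least-significant bits are clear; for the common
// 16-byte alignment, NrBitsToZero is 4 and the encoding is 0x1f3b.
static unsigned encodeRealignMask(unsigned NrBitsToZero) {
  return (1u << 12)                  // N = 1: 64-bit element
       | ((64u - NrBitsToZero) << 6) // immr: rotate amount
       | (64u - NrBitsToZero - 1);   // imms: run length minus one
  // The mask ANDed into SP is ~((1ull << NrBitsToZero) - 1),
  // e.g. 0xfffffffffffffff0 for 16-byte alignment.
}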
if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. uint64_t ArgumentPopSize = 0; if (IsTailCallReturn) { MachineOperand &StackAdjust = MBBI->getOperand(1); // For a tail-call in a callee-pops-arguments environment, some or all of // the stack may actually be in use for the call's arguments, this is // calculated during LowerCall and consumed here... ArgumentPopSize = StackAdjust.getImm(); } else { // ... otherwise the amount to pop is *all* of the argument space, // conveniently stored in the MachineFunctionInfo by // LowerFormalArguments. This will, of course, be zero for the C calling // convention. ArgumentPopSize = AFI->getArgumentStackToRestore(); } // The stack frame should be like below, // // ---------------------- --- // | | | // | BytesInStackArgArea| CalleeArgStackSize // | (NumReusableBytes) | (of tail call) // | | --- // | | | // ---------------------| --- | // | | | | // | CalleeSavedReg | | | // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) // | LocalStackSize | | | // | (covering callee | | | // | args) | | | // | | | | // ---------------------- --- --- // // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize // = StackSize + ArgumentPopSize // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. auto Cleanup = make_scope_exit([&] { InsertReturnAddressAuth(MF, MBB); }); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); // Var args are accounted for in the containing function, so don't // include them for funclets. unsigned FixedObject = (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; uint64_t AfterCSRPopSize = ArgumentPopSize; auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject; // We cannot rely on the local stack size set in emitPrologue if the function // has funclets, as funclets have different local stack size requirements, and // the current value set in emitPrologue may be that of the containing // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); // Assume we can't combine the last pop with the sp restore. if (!CombineSPBump && PrologueSaveSize != 0) { MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator()); while (AArch64InstrInfo::isSEHInstruction(*Pop)) Pop = std::prev(Pop); // Converting the last ldp to a post-index ldp is valid only if the last // ldp's offset is 0. const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1); // If the offset is 0, convert it to a post-index ldp. if (OffsetOp.getImm() == 0) convertCalleeSaveRestoreToSPPrePostIncDec( MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, false); else { // If not, make sure to emit an add after the last ldp. // We're doing this by transfering the size to be restored from the // adjustment *before* the CSR pops to the adjustment *after* the CSR // pops. AfterCSRPopSize += PrologueSaveSize; } } // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. 
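// Illustrative sketch (not part of the upstream file): the epilogue's
// accounting for the prologue-save (callee-save plus fixed-object) area
// decided just above.  If the last callee-save reload sits at [sp, #0] it is
// rewritten to a post-indexed load that also frees the area; otherwise its
// size is folded into the SP bump emitted after the reloads, together with
// any tail-call argument pop.
static unsigned afterCSRPopBytes(unsigned ArgumentPopSize,
                                 unsigned PrologueSaveSize,
                                 bool LastReloadAtOffsetZero,
                                 bool CombinedSPBump) {
  unsigned Pop = ArgumentPopSize;
  if (!CombinedSPBump && PrologueSaveSize != 0 && !LastReloadAtOffsetZero)
    Pop += PrologueSaveSize; // no post-indexed reload possible; add sp later
  return Pop;
}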
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; } else if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(), NeedsWinCFI); } if (NeedsWinCFI) BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart)) .setMIFlag(MachineInstr::FrameDestroy); // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); if (NeedsWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); return; } NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the // stack pointer (but we may need to pop stack args for fastcc). if (RedZone && AfterCSRPopSize == 0) return; bool NoCalleeSaveRestore = PrologueSaveSize == 0; int StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += AfterCSRPopSize; // If we were able to combine the local stack pop with the argument pop, // then we're done. bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0; // If we're done after this, make sure to help the load store optimizer. if (Done) adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); if (Done) { if (NeedsWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); return; } NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, -AFI->getCalleeSavedStackSize() + 16, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); else if (NumBytes) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save // code in the prologue. if (AfterCSRPopSize) { // Find an insertion point for the first ldp so that it goes before the // shadow call stack epilog instruction. This ensures that the restore of // lr from x18 is placed after the restore from sp. 
auto FirstSPPopI = MBB.getFirstTerminator(); while (FirstSPPopI != Begin) { auto Prev = std::prev(FirstSPPopI); if (Prev->getOpcode() != AArch64::LDRXpre || Prev->getOperand(0).getReg() == AArch64::SP) break; FirstSPPopI = Prev; } adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); } if (NeedsWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) .setMIFlag(MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for /// debug info. It's the same as what we use for resolving the code-gen /// references for now. FIXME: This can go wrong when references are /// SP-relative and simple call frames aren't used. int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { return resolveFrameIndexReference(MF, FI, FrameReg); } int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64RegisterInfo *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); const AArch64FunctionInfo *AFI = MF.getInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16; int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize(); bool isFixed = MFI.isFixedObjectIndex(FI); bool isCSR = !isFixed && MFI.getObjectOffset(FI) >= -((int)AFI->getCalleeSavedStackSize()); // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the // right thing for the emergency spill slot. bool UseFP = false; if (AFI->hasStackFrame()) { // Note: Keeping the following as multiple 'if' statements rather than // merging to a single expression for readability. // // Argument access should always use the FP. if (isFixed) { UseFP = hasFP(MF); } else if (isCSR && RegInfo->needsStackRealignment(MF)) { // References to the CSR area must use FP if we're re-aligning the stack // since the dynamically-sized alignment padding is between the SP/BP and // the CSR area. assert(hasFP(MF) && "Re-aligned stack must have frame pointer"); UseFP = true; } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) { // If the FPOffset is negative, we have to keep in mind that the // available offset range for negative offsets is smaller than for // positive ones. If an offset is // available via the FP and the SP, use whichever is closest. bool FPOffsetFits = FPOffset >= -256; PreferFP |= Offset > -FPOffset; if (MFI.hasVarSizedObjects()) { // If we have variable sized objects, we can use either FP or BP, as the // SP offset is unknown. We can use the base pointer if we have one and // FP is not preferred. If not, we're stuck with using FP. bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; else if (!CanUseBP) // Can't use BP. Forced to use FP. UseFP = true; // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. 
If it won't fit for BP either, we'll scavenge anyway. } else if (FPOffset >= 0) { // Use SP or FP, whichever gives us the best chance of the offset // being in range for direct access. If the FPOffset is positive, // that'll always be best, as the SP will be even further away. UseFP = true; } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { // Funclets access the locals contained in the parent's stack frame // via the frame pointer, so we have to use the FP in the parent // function. assert( Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()) && "Funclets should only be present on Win64"); UseFP = true; } else { // We have the choice between FP and (SP or BP). if (FPOffsetFits && PreferFP) // If FP is the best fit, use it. UseFP = true; } } } assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) && "In the presence of dynamic stack pointer realignment, " "non-argument/CSR objects cannot be accessed through the frame pointer"); if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); return FPOffset; } // Use the base pointer if we have one. if (RegInfo->hasBasePointer(MF)) FrameReg = RegInfo->getBaseRegister(); else { assert(!MFI.hasVarSizedObjects() && "Can't use SP when we have var sized objects."); FrameReg = AArch64::SP; // If we're using the red zone for this function, the SP won't actually // be adjusted, so the offsets will be negative. They're also all // within range of the signed 9-bit immediate instructions. if (canUseRedZone(MF)) Offset -= AFI->getLocalStackSize(); } return Offset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { // Do not set a kill flag on values that are also marked as live-in. This // happens with the @llvm-returnaddress intrinsic and with arguments passed in // callee saved registers. // Omitting the kill flags is conservatively correct even if the live-in // is not used after all. bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); return getKillRegState(!IsLiveIn); } static bool produceCompactUnwindFrame(MachineFunction &MF) { const AArch64Subtarget &Subtarget = MF.getSubtarget(); AttributeList Attrs = MF.getFunction().getAttributes(); return Subtarget.isTargetMachO() && !(Subtarget.getTargetLowering()->supportSwiftError() && Attrs.hasAttrSomewhere(Attribute::SwiftError)); } static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, bool NeedsWinCFI) { // If we are generating register pairs for a Windows function that requires // EH support, then pair consecutive registers only. There are no unwind // opcodes for saves/restores of non-consectuve register pairs. // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_frepg_x. // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling // TODO: LR can be paired with any register. We don't support this yet in // the MCLayer. We need to add support for the save_lrpair unwind code. 
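// Illustrative sketch (not part of the upstream file): the two candidate
// offsets computed at the top of resolveFrameIndexReference() above.  Object
// offsets are relative to the incoming SP, the frame pointer is set up at
// incoming_sp - FixedObject - 16, and the final SP is incoming_sp - StackSize,
// hence:
//   FPOffset = ObjectOffset + FixedObject + 16
//   SPOffset = ObjectOffset + StackSize
// e.g. ObjectOffset = -40, StackSize = 96, FixedObject = 0 gives fp - 24 or
// sp + 56, and the logic above picks whichever base keeps the offset encodable.
static void candidateFrameOffsets(int ObjectOffset, int StackSize,
                                  int FixedObject, int &FPOffset,
                                  int &SPOffset) {
  FPOffset = ObjectOffset + FixedObject + 16;
  SPOffset = ObjectOffset + StackSize;
}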
if (!NeedsWinCFI) return false; if (Reg2 == Reg1 + 1) return false; return true; } namespace { struct RegPairInfo { unsigned Reg1 = AArch64::NoRegister; unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; enum RegType { GPR, FPR64, FPR128 } Type; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } }; } // end anonymous namespace static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, bool &NeedShadowCallStackProlog) { if (CSI.empty()) return; bool NeedsWinCFI = needsWinCFI(MF); AArch64FunctionInfo *AFI = MF.getInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); CallingConv::ID CC = MF.getFunction().getCallingConv(); unsigned Count = CSI.size(); (void)CC; // MachO's compact unwind format relies on all registers being stored in // pairs. assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); int Offset = AFI->getCalleeSavedStackSize(); // On Linux, we will have either one or zero non-paired register. On Windows // with CFI, we can have multiple unpaired registers in order to utilize the // available unwind codes. This flag assures that the alignment fixup is done // only once, as intened. bool FixupDone = false; for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; RPI.Reg1 = CSI[i].getReg(); if (AArch64::GPR64RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::GPR; else if (AArch64::FPR64RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::FPR64; else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::FPR128; else llvm_unreachable("Unsupported register class."); // Add the next reg to the pair if it is in the same register class. if (i + 1 < Count) { unsigned NextReg = CSI[i + 1].getReg(); switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: if (AArch64::FPR64RegClass.contains(NextReg) && !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR128: if (AArch64::FPR128RegClass.contains(NextReg)) RPI.Reg2 = NextReg; break; } } // If either of the registers to be saved is the lr register, it means that // we also need to save lr in the shadow call stack. if ((RPI.Reg1 == AArch64::LR || RPI.Reg2 == AArch64::LR) && MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { if (!MF.getSubtarget().isXRegisterReserved(18)) report_fatal_error("Must reserve x18 to use shadow call stack"); NeedShadowCallStackProlog = true; } // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. assert((!RPI.isPaired() || (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. 
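// Illustrative sketch (not part of the upstream file): how the scaled STP/LDP
// immediates for the callee-save slots are handed out below.  Offsets start
// at the top of the area and drop by 2 * Scale per pair (Scale is 8 for X/D
// registers, 16 for Q registers); dividing by Scale must land in the signed
// 7-bit range [-64, 63] of the pair instructions.  For a 48-byte area this
// reproduces the sequence in the spill comment further down:
//   fp/lr at byte 32 (imm 4), x20/x19 at byte 16 (imm 2), x22/x21 at byte 0.
static int nextCalleeSavePairImm(int &OffsetBytes /* starts at the area size */,
                                 int Scale, bool IsPaired) {
  OffsetBytes -= IsPaired ? 2 * Scale : Scale;
  return OffsetBytes / Scale; // valid only while it stays within [-64, 63]
}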
assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || (RPI.isPaired() && ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || RPI.Reg1 + 1 == RPI.Reg2))) && "Callee-save registers not saved as adjacent register pair!"); RPI.FrameIdx = CSI[i].getFrameIdx(); int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8; Offset -= RPI.isPaired() ? 2 * Scale : Scale; // Round up size of non-pair to pair size if we need to pad the // callee-save area to ensure 16-byte alignment. if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone && RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) { FixupDone = true; Offset -= 8; assert(Offset % 16 == 0); assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); MFI.setObjectAlignment(RPI.FrameIdx, 16); } assert(Offset % Scale == 0); RPI.Offset = Offset / Scale; assert((RPI.Offset >= -64 && RPI.Offset <= 63) && "Offset out of bounds for LDP/STP immediate"); RegPairs.push_back(RPI); if (RPI.isPaired()) ++i; } } bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); bool NeedsWinCFI = needsWinCFI(MF); DebugLoc DL; SmallVector RegPairs; bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, NeedShadowCallStackProlog); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (NeedShadowCallStackProlog) { // Shadow call stack prolog: str x30, [x18], #8 BuildMI(MBB, MI, DL, TII.get(AArch64::STRXpost)) .addReg(AArch64::X18, RegState::Define) .addReg(AArch64::LR) .addReg(AArch64::X18) .addImm(8) .setMIFlag(MachineInstr::FrameSetup); if (NeedsWinCFI) BuildMI(MBB, MI, DL, TII.get(AArch64::SEH_Nop)) .setMIFlag(MachineInstr::FrameSetup); if (!MF.getFunction().hasFnAttribute(Attribute::NoUnwind)) { // Emit a CFI instruction that causes 8 to be subtracted from the value of // x18 when unwinding past this frame. static const char CFIInst[] = { dwarf::DW_CFA_val_expression, 18, // register 2, // length static_cast(unsigned(dwarf::DW_OP_breg18)), static_cast(-8) & 0x7f, // addend (sleb128) }; unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(nullptr, CFIInst)); BuildMI(MBB, MI, DL, TII.get(AArch64::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); } // This instruction also makes x18 live-in to the entry block. MBB.addLiveIn(AArch64::X18); } for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { RegPairInfo RPI = *RPII; unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; unsigned StrOpc; // Issue sequence of spills for cs regs. The first spill may be converted // to a pre-decrement store later by emitPrologue if the callee-save stack // area allocation can't be combined with the local stack area allocation. // For example: // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! // Note: Similar rationale and sequence for restores in epilog. unsigned Size, Align; switch (RPI.Type) { case RegPairInfo::GPR: StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; Size = 8; Align = 8; break; case RegPairInfo::FPR64: StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui; Size = 8; Align = 8; break; case RegPairInfo::FPR128: StrOpc = RPI.isPaired() ? 
AArch64::STPQi : AArch64::STRQui; Size = 16; Align = 16; break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; dbgs() << ")\n"); assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) && "Windows unwdinding requires a consecutive (FP,LR) pair"); // Windows unwind codes require consecutive registers if registers are // paired. Make the switch here, so that the code below will save (x,x+1) // and not (x+1,x). unsigned FrameIdxReg1 = RPI.FrameIdx; unsigned FrameIdxReg2 = RPI.FrameIdx + 1; if (NeedsWinCFI && RPI.isPaired()) { std::swap(Reg1, Reg2); std::swap(FrameIdxReg1, FrameIdxReg2); } MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); if (!MRI.isReserved(Reg1)) MBB.addLiveIn(Reg1); if (RPI.isPaired()) { if (!MRI.isReserved(Reg2)) MBB.addLiveIn(Reg2); MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), MachineMemOperand::MOStore, Size, Align)); } MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) .addImm(RPI.Offset) // [sp, #offset*scale], // where factor*scale is implicit .setMIFlag(MachineInstr::FrameSetup); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF,FrameIdxReg1), MachineMemOperand::MOStore, Size, Align)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameSetup); } return true; } bool AArch64FrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, std::vector &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; SmallVector RegPairs; bool NeedsWinCFI = needsWinCFI(MF); if (MI != MBB.end()) DL = MI->getDebugLoc(); bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, NeedShadowCallStackProlog); auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; // Issue sequence of restores for cs regs. The last restore may be converted // to a post-increment load later by emitEpilogue if the callee-save stack // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; unsigned Size, Align; switch (RPI.Type) { case RegPairInfo::GPR: LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; Size = 8; Align = 8; break; case RegPairInfo::FPR64: LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui; Size = 8; Align = 8; break; case RegPairInfo::FPR128: LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui; Size = 16; Align = 16; break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); dbgs() << ") -> fi#(" << RPI.FrameIdx; if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1; dbgs() << ")\n"); // Windows unwind codes require consecutive registers if registers are // paired. Make the switch here, so that the code below will save (x,x+1) // and not (x+1,x). 
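The RPI.Offset used by the spill and restore code above is the byte offset divided by the access size (8 for X/D registers, 16 for Q registers), because the LDP/STP immediate is a scaled signed 7-bit field. A small sketch of that encoding check, assuming the [-64, 63] scaled range asserted earlier; it is not the MC encoder.

#include <cassert>
#include <optional>

// Return the scaled imm7 for an LDP/STP at the given byte offset, or nullopt
// if the offset is not a multiple of the access size or is out of range.
static std::optional<int> encodeScaledOffset(int byteOffset, int scale) {
  if (byteOffset % scale != 0)
    return std::nullopt;
  int imm = byteOffset / scale;
  if (imm < -64 || imm > 63)
    return std::nullopt;
  return imm;
}

int main() {
  assert(*encodeScaledOffset(32, 8) == 4);    // stp x20, x19, [sp, #32]
  assert(*encodeScaledOffset(-32, 16) == -2); // q-register pair below sp
  assert(!encodeScaledOffset(12, 8));         // misaligned for the access size
  assert(!encodeScaledOffset(1024, 8));       // scaled value 128 is out of range
}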
unsigned FrameIdxReg1 = RPI.FrameIdx; unsigned FrameIdxReg2 = RPI.FrameIdx + 1; if (NeedsWinCFI && RPI.isPaired()) { std::swap(Reg1, Reg2); std::swap(FrameIdxReg1, FrameIdxReg2); } MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); if (RPI.isPaired()) { MIB.addReg(Reg2, getDefRegState(true)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg2), MachineMemOperand::MOLoad, Size, Align)); } MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) .addImm(RPI.Offset) // [sp, #offset*scale] // where factor*scale is implicit .setMIFlag(MachineInstr::FrameDestroy); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, FrameIdxReg1), MachineMemOperand::MOLoad, Size, Align)); if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); }; if (ReverseCSRRestoreSeq) for (const RegPairInfo &RPI : reverse(RegPairs)) EmitMI(RPI); else for (const RegPairInfo &RPI : RegPairs) EmitMI(RPI); if (NeedShadowCallStackProlog) { // Shadow call stack epilog: ldr x30, [x18, #-8]! BuildMI(MBB, MI, DL, TII.get(AArch64::LDRXpre)) .addReg(AArch64::X18, RegState::Define) .addReg(AArch64::LR, RegState::Define) .addReg(AArch64::X18) .addImm(-8) .setMIFlag(MachineInstr::FrameDestroy); } return true; } void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. if (MF.getFunction().getCallingConv() == CallingConv::GHC) return; TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); const AArch64RegisterInfo *RegInfo = static_cast( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo(); unsigned UnspilledCSGPR = AArch64::NoRegister; unsigned UnspilledCSGPRPaired = AArch64::NoRegister; MachineFrameInfo &MFI = MF.getFrameInfo(); const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs(); unsigned BasePointerReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() : (unsigned)AArch64::NoRegister; unsigned ExtraCSSpill = 0; // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { const unsigned Reg = CSRegs[i]; // Add the base pointer register to SavedRegs if it is callee-save. if (Reg == BasePointerReg) SavedRegs.set(Reg); bool RegUsed = SavedRegs.test(Reg); unsigned PairedReg = CSRegs[i ^ 1]; if (!RegUsed) { if (AArch64::GPR64RegClass.contains(Reg) && !RegInfo->isReservedReg(MF, Reg)) { UnspilledCSGPR = Reg; UnspilledCSGPRPaired = PairedReg; } continue; } // MachO's compact unwind format relies on all registers being stored in // pairs. // FIXME: the usual format is actually better if unwinding isn't needed. if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister && !SavedRegs.test(PairedReg)) { SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && !RegInfo->isReservedReg(MF, PairedReg)) ExtraCSSpill = PairedReg; } } // Calculates the callee saved stack size. unsigned CSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned Reg : SavedRegs.set_bits()) CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8; // Save number of saved regs, so we can easily update CSStackSize later. 
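The callee-saved stack size computed in determineCalleeSaves above is simply the byte sizes of all saved registers summed and then rounded up to 16 so the stack pointer stays 16-byte aligned; when the rounding adds slack, CalleeSaveStackHasFreeSpace records that the padding can be reused. A minimal arithmetic sketch, assuming only 64-bit GPRs and 128-bit Q registers.

#include <cassert>
#include <vector>

static unsigned alignUpTo(unsigned value, unsigned align) {
  return (value + align - 1) / align * align;
}

int main() {
  // Suppose x19, x20, fp, lr (8 bytes each) and one q-register (16 bytes).
  std::vector<unsigned> savedRegBits = {64, 64, 64, 64, 128};
  unsigned csStackSize = 0;
  for (unsigned bits : savedRegBits)
    csStackSize += bits / 8;              // 4*8 + 16 = 48 bytes
  unsigned aligned = alignUpTo(csStackSize, 16);
  bool hasFreeSpace = aligned != csStackSize;
  assert(csStackSize == 48 && aligned == 48 && !hasFreeSpace);

  csStackSize += 8;                       // one more GPR -> 56 bytes
  aligned = alignUpTo(csStackSize, 16);   // rounded up to 64
  hasFreeSpace = aligned != csStackSize;
  assert(aligned == 64 && hasFreeSpace);  // 8 bytes of reusable padding
}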
unsigned NumSavedRegs = SavedRegs.count(); // The frame record needs to be created by saving the appropriate registers unsigned EstimatedStackSize = MFI.estimateStackSize(MF); if (hasFP(MF) || windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) { SavedRegs.set(AArch64::FP); SavedRegs.set(AArch64::LR); } LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. bool CanEliminateFrame = SavedRegs.count() == 0; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); // Estimate if we might need to scavenge a register at some point in order // to materialize a stack offset. If so, either spill one additional // callee-saved register or reserve a special spill slot to facilitate // register scavenging. If we already spilled an extra callee-saved register // above to keep the number of spills even, we don't need to do anything else // here. if (BigStack) { if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo) << " to get a scratch register.\n"); SavedRegs.set(UnspilledCSGPR); // MachO's compact unwind format relies on all registers being stored in // pairs, so if we need to spill one extra for BigStack, then we need to // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = UnspilledCSGPRPaired; } // If we didn't find an extra callee-saved register to spill, create // an emergency spill slot. if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass &RC = AArch64::GPR64RegClass; unsigned Size = TRI->getSpillSize(RC); unsigned Align = TRI->getSpillAlignment(RC); int FI = MFI.CreateStackObject(Size, Align, false); RS->addScavengingFrameIndex(FI); LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI << " as the emergency spill slot.\n"); } } // Adding the size of additional 64bit GPR saves. CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs); unsigned AlignedCSStackSize = alignTo(CSStackSize, 16); LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << EstimatedStackSize + AlignedCSStackSize << " bytes.\n"); // Round up to register pair alignment to avoid additional SP adjustment // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); } bool AArch64FrameLowering::enableStackSlotScavenging( const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); return AFI->hasCalleeSaveStackFreeSpace(); } void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. 
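The scavenging logic above boils down to a small decision: if the estimated frame is too large for direct offsets, first try to spill one extra unspilled callee-saved GPR as a scratch register, and fall back to an emergency spill slot when no candidate is available or the candidate is already used. The sketch below collapses this to a single choice with made-up names; the real code can do both, and also keeps compact-unwind pairing intact.

#include <cstdio>

enum class ScratchPlan { None, SpillExtraCSR, EmergencySpillSlot };

// bigStack: estimated frame size exceeds the reachable-offset limit.
// haveUnspilledCSR: some callee-saved GPR is still unspilled and unreserved.
// candidateAlreadyUsed: the candidate register is already used elsewhere.
static ScratchPlan chooseScratchPlan(bool bigStack, bool haveUnspilledCSR,
                                     bool candidateAlreadyUsed) {
  if (!bigStack)
    return ScratchPlan::None;
  if (haveUnspilledCSR && !candidateAlreadyUsed)
    return ScratchPlan::SpillExtraCSR;
  return ScratchPlan::EmergencySpillSlot;
}

int main() {
  std::printf("%d\n", (int)chooseScratchPlan(true, true, false));  // 1: spill extra CSR
  std::printf("%d\n", (int)chooseScratchPlan(true, false, false)); // 2: emergency slot
  std::printf("%d\n", (int)chooseScratchPlan(false, true, false)); // 0: nothing needed
}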
if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); auto MBBI = MBB.begin(); while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; - if (MBBI->isTerminator()) - return; - // Create an UnwindHelp object. int UnwindHelpFI = MFI.CreateStackObject(/*size*/8, /*alignment*/16, false); EHInfo.UnwindHelpFrameIdx = UnwindHelpFI; // We need to store -2 into the UnwindHelp object at the start of the // function. DebugLoc DL; - RS->enterBasicBlock(MBB); - unsigned DstReg = RS->scavengeRegister(&AArch64::GPR64RegClass, MBBI, 0); + RS->enterBasicBlockEnd(MBB); + RS->backward(std::prev(MBBI)); + unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass); + assert(DstReg && "There must be a free register after frame setup"); BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2); BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi)) .addReg(DstReg, getKillRegState(true)) .addFrameIndex(UnwindHelpFI) .addImm(0); } /// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before /// the update. This is easily retrieved as it is exactly the offset that is set /// in processFunctionBeforeFrameFinalized. int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is " << MFI.getObjectOffset(FI) << "\n"); FrameReg = AArch64::SP; return MFI.getObjectOffset(FI); } /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve /// the parent's frame pointer unsigned AArch64FrameLowering::getWinEHParentFrameOffset( const MachineFunction &MF) const { return 0; } /// Funclets only need to account for space for the callee saved registers, /// as the locals are accounted for in the parent's stack frame. unsigned AArch64FrameLowering::getWinEHFuncletFrameSize( const MachineFunction &MF) const { // This is the size of the pushed CSRs. unsigned CSSize = MF.getInfo()->getCalleeSavedStackSize(); // This is the amount of stack a funclet needs to allocate. return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(), getStackAlignment()); } Index: vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64TargetMachine.cpp (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/AArch64/AArch64TargetMachine.cpp (revision 344765) @@ -1,592 +1,592 @@ //===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// // // //===----------------------------------------------------------------------===// #include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" #include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" #include #include using namespace llvm; static cl::opt EnableCCMP("aarch64-enable-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); static cl::opt EnableCondBrTuning("aarch64-enable-cond-br-tune", cl::desc("Enable the conditional branch tuning pass"), cl::init(true), cl::Hidden); static cl::opt EnableMCR("aarch64-enable-mcr", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); static cl::opt EnableStPairSuppress("aarch64-enable-stp-suppress", cl::desc("Suppress STP for AArch64"), cl::init(true), cl::Hidden); static cl::opt EnableAdvSIMDScalar( "aarch64-enable-simd-scalar", cl::desc("Enable use of AdvSIMD scalar integer instructions"), cl::init(false), cl::Hidden); static cl::opt EnablePromoteConstant("aarch64-enable-promote-const", cl::desc("Enable the promote constant pass"), cl::init(true), cl::Hidden); static cl::opt EnableCollectLOH( "aarch64-enable-collect-loh", cl::desc("Enable the pass that emits the linker optimization hints (LOH)"), cl::init(true), cl::Hidden); static cl::opt EnableDeadRegisterElimination("aarch64-enable-dead-defs", cl::Hidden, cl::desc("Enable the pass that removes dead" " definitons and replaces stores to" " them with stores to the zero" " register"), cl::init(true)); static cl::opt EnableRedundantCopyElimination( "aarch64-enable-copyelim", cl::desc("Enable the redundant copy elimination pass"), cl::init(true), cl::Hidden); static cl::opt EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden); static cl::opt EnableAtomicTidy( "aarch64-enable-atomic-cfg-tidy", cl::Hidden, cl::desc("Run SimplifyCFG after expanding atomic operations" " to make use of cmpxchg flow-based information"), cl::init(true)); static cl::opt EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden, cl::desc("Run early if-conversion"), cl::init(true)); static cl::opt EnableCondOpt("aarch64-enable-condopt", cl::desc("Enable the condition optimizer pass"), cl::init(true), cl::Hidden); static cl::opt EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, cl::desc("Work around Cortex-A53 erratum 835769"), cl::init(false)); static cl::opt EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, 
cl::desc("Enable optimizations on complex GEPs"), cl::init(false)); static cl::opt BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true), cl::desc("Relax out of range conditional branches")); static cl::opt EnableCompressJumpTables( "aarch64-enable-compress-jump-tables", cl::Hidden, cl::init(true), cl::desc("Use smallest entry possible for jump tables")); // FIXME: Unify control over GlobalMerge. static cl::opt EnableGlobalMerge("aarch64-enable-global-merge", cl::Hidden, cl::desc("Enable the global merge pass")); static cl::opt EnableLoopDataPrefetch("aarch64-enable-loop-data-prefetch", cl::Hidden, cl::desc("Enable the loop data prefetch pass"), cl::init(true)); static cl::opt EnableGlobalISelAtO( "aarch64-enable-global-isel-at-O", cl::Hidden, cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(0)); static cl::opt EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", cl::init(true), cl::Hidden); static cl::opt EnableBranchTargets("aarch64-enable-branch-targets", cl::Hidden, cl::desc("Enable the AAcrh64 branch target pass"), cl::init(true)); extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine X(getTheAArch64leTarget()); RegisterTargetMachine Y(getTheAArch64beTarget()); RegisterTargetMachine Z(getTheARM64Target()); auto PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); initializeAArch64A57FPLoadBalancingPass(*PR); initializeAArch64AdvSIMDScalarPass(*PR); initializeAArch64BranchTargetsPass(*PR); initializeAArch64CollectLOHPass(*PR); initializeAArch64CompressJumpTablesPass(*PR); initializeAArch64ConditionalComparesPass(*PR); initializeAArch64ConditionOptimizerPass(*PR); initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PreLegalizerCombinerPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); initializeFalkorHWPFFixPass(*PR); initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); initializeAArch64SpeculationHardeningPass(*PR); } //===----------------------------------------------------------------------===// // AArch64 Lowering public interface. //===----------------------------------------------------------------------===// static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) return llvm::make_unique(); if (TT.isOSBinFormatCOFF()) return llvm::make_unique(); return llvm::make_unique(); } // Helper function to build a DataLayout string static std::string computeDataLayout(const Triple &TT, const MCTargetOptions &Options, bool LittleEndian) { if (Options.getABIName() == "ilp32") return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; } static Reloc::Model getEffectiveRelocModel(const Triple &TT, Optional RM) { - // AArch64 Darwin is always PIC. - if (TT.isOSDarwin()) + // AArch64 Darwin and Windows are always PIC. 
+ if (TT.isOSDarwin() || TT.isOSWindows()) return Reloc::PIC_; // On ELF platforms the default static relocation model has a smart enough // linker to cope with referencing external symbols defined in a shared // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC) return Reloc::Static; return *RM; } static CodeModel::Model getEffectiveAArch64CodeModel(const Triple &TT, Optional CM, bool JIT) { if (CM) { if (*CM != CodeModel::Small && *CM != CodeModel::Tiny && *CM != CodeModel::Large) { if (!TT.isOSFuchsia()) report_fatal_error( "Only small, tiny and large code models are allowed on AArch64"); else if (*CM != CodeModel::Kernel) report_fatal_error("Only small, tiny, kernel, and large code models " "are allowed on AArch64"); } else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF()) report_fatal_error("tiny code model is only supported on ELF"); return *CM; } // The default MCJIT memory managers make no guarantees about where they can // find an executable page; JITed code needs to be able to refer to globals // no matter how far away they are. if (JIT) return CodeModel::Large; return CodeModel::Small; } /// Create an AArch64 architecture model. /// AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional RM, Optional CM, CodeGenOpt::Level OL, bool JIT, bool LittleEndian) : LLVMTargetMachine(T, computeDataLayout(TT, Options.MCOptions, LittleEndian), TT, CPU, FS, Options, getEffectiveRelocModel(TT, RM), getEffectiveAArch64CodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), isLittle(LittleEndian) { initAsmInfo(); if (TT.isOSBinFormatMachO()) { this->Options.TrapUnreachable = true; this->Options.NoTrapAfterNoreturn = true; } if (getMCAsmInfo()->usesWindowsCFI()) { // Unwinding can get confused if the last instruction in an // exception-handling region (function, funclet, try block, etc.) // is a call. // // FIXME: We could elide the trap if the next instruction would be in // the same region anyway. this->Options.TrapUnreachable = true; } // Enable GlobalISel at or below EnableGlobalISelAt0. if (getOptLevel() <= EnableGlobalISelAtO) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); } // AArch64 supports the MachineOutliner. setMachineOutliner(true); // AArch64 supports default outlining behaviour. setSupportsDefaultOutlining(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; const AArch64Subtarget * AArch64TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); Attribute FSAttr = F.getFnAttribute("target-features"); std::string CPU = !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString().str() : TargetCPU; std::string FS = !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString().str() : TargetFS; auto &I = SubtargetMap[CPU + FS]; if (!I) { // This needs to be done before we create a new subtarget since any // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
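The SubtargetMap lookup above, together with the creation just below, builds one subtarget per distinct "target-cpu"/"target-features" attribute combination and caches it, so functions compiled with different attributes get different subtargets while identical ones share. A minimal sketch of that memoization pattern with a placeholder Subtarget struct; the key concatenation mirrors the CPU + FS map key used here.

#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

struct Subtarget { // placeholder for the real target-specific subtarget
  std::string cpu, features;
  Subtarget(std::string c, std::string f)
      : cpu(std::move(c)), features(std::move(f)) {}
};

class ToyTargetMachine {
  mutable std::unordered_map<std::string, std::unique_ptr<Subtarget>> cache;

public:
  const Subtarget *getSubtargetFor(const std::string &cpu,
                                   const std::string &features) const {
    auto &slot = cache[cpu + features]; // one entry per attribute combination
    if (!slot)
      slot = std::make_unique<Subtarget>(cpu, features);
    return slot.get();
  }
};

int main() {
  ToyTargetMachine tm;
  const Subtarget *a = tm.getSubtargetFor("cortex-a53", "+neon");
  const Subtarget *b = tm.getSubtargetFor("cortex-a53", "+neon");
  const Subtarget *c = tm.getSubtargetFor("cortex-a57", "+neon");
  assert(a == b && a != c); // identical attributes share one subtarget
}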
resetTargetOptions(F); I = llvm::make_unique(TargetTriple, CPU, FS, *this, isLittle); } return I.get(); } void AArch64leTargetMachine::anchor() { } AArch64leTargetMachine::AArch64leTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional RM, Optional CM, CodeGenOpt::Level OL, bool JIT) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, true) {} void AArch64beTargetMachine::anchor() { } AArch64beTargetMachine::AArch64beTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional RM, Optional CM, CodeGenOpt::Level OL, bool JIT) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, JIT, false) {} namespace { /// AArch64 Code Generator Pass Configuration Options. class AArch64PassConfig : public TargetPassConfig { public: AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) { if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); } AArch64TargetMachine &getAArch64TargetMachine() const { return getTM(); } ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget(); ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.hasFusion()) DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget(); if (ST.hasFusion()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). ScheduleDAGMI *DAG = createGenericSchedPostRA(C); DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } return nullptr; } void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; bool addIRTranslator() override; void addPreLegalizeMachineIR() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; bool addILPOpts() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; } // end anonymous namespace TargetTransformInfo AArch64TargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(AArch64TTIImpl(this, F)); } TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { return new AArch64PassConfig(*this, PM); } void AArch64PassConfig::addIRPasses() { // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg // ourselves. addPass(createAtomicExpandPass()); // Cmpxchg instructions are often used with a subsequent comparison to // determine whether it succeeded. We can exploit existing control-flow in // ldrex/strex loops to simplify this, but it needs tidying up. if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass(1, true, true, false, true)); // Run LoopDataPrefetch // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. 
if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableLoopDataPrefetch) addPass(createLoopDataPrefetchPass()); if (EnableFalkorHWPFFix) addPass(createFalkorMarkStridedAccessesPass()); } TargetPassConfig::addIRPasses(); // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedLoadCombinePass()); addPass(createInterleavedAccessPass()); } if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { // Call SeparateConstOffsetFromGEP pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or // multiple GEPs with single index. addPass(createSeparateConstOffsetFromGEPPass(true)); // Call EarlyCSE pass to find and remove subexpressions in the lowered // result. addPass(createEarlyCSEPass()); // Do loop invariant code motion in case part of the lowered result is // invariant. addPass(createLICMPass()); } } // Pass Pipeline Configuration bool AArch64PassConfig::addPreISel() { // Run promote constant before global merge, so that the promoted constants // get a chance to be merged if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) addPass(createAArch64PromoteConstantPass()); // FIXME: On AArch64, this depends on the type. // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(). // and the offset has to be a multiple of the related size in bytes. if ((TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge == cl::BOU_UNSET) || EnableGlobalMerge == cl::BOU_TRUE) { bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && (EnableGlobalMerge == cl::BOU_UNSET); addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize)); } return false; } bool AArch64PassConfig::addInstSelector() { addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many // references to _TLS_MODULE_BASE_ as possible. if (TM->getTargetTriple().isOSBinFormatELF() && getOptLevel() != CodeGenOpt::None) addPass(createAArch64CleanupLocalDynamicTLSPass()); return false; } bool AArch64PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; } void AArch64PassConfig::addPreLegalizeMachineIR() { addPass(createAArch64PreLegalizeCombiner()); } bool AArch64PassConfig::addLegalizeMachineIR() { addPass(new Legalizer()); return false; } bool AArch64PassConfig::addRegBankSelect() { addPass(new RegBankSelect()); return false; } void AArch64PassConfig::addPreGlobalInstructionSelect() { // Workaround the deficiency of the fast register allocator. if (TM->getOptLevel() == CodeGenOpt::None) addPass(new Localizer()); } bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; } bool AArch64PassConfig::addILPOpts() { if (EnableCondOpt) addPass(createAArch64ConditionOptimizerPass()); if (EnableCCMP) addPass(createAArch64ConditionalCompares()); if (EnableMCR) addPass(&MachineCombinerID); if (EnableCondBrTuning) addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); return true; } void AArch64PassConfig::addPreRegAlloc() { // Change dead register definitions to refer to the zero register. if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); // Use AdvSIMD scalar instructions whenever profitable. 
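The pass-config hooks above all follow the same shape: each optional pass is appended only when its cl::opt flag is on and the optimization level allows it. A toy sketch of that gating, using strings in place of real passes and plain bools in place of cl::opt; it only illustrates the pattern, not the actual AArch64 pipeline.

#include <cstdio>
#include <string>
#include <vector>

enum OptLevel { O0, O1, O2, O3 };

struct PipelineConfig {
  OptLevel opt = O2;
  bool enableCondOpt = true;
  bool enableCCMP = true;
  bool enableEarlyIfCvt = true;
};

static std::vector<std::string> buildILPOpts(const PipelineConfig &c) {
  std::vector<std::string> passes;
  if (c.opt == O0)
    return passes; // nothing added at -O0
  if (c.enableCondOpt)
    passes.push_back("aarch64-condopt");
  if (c.enableCCMP)
    passes.push_back("aarch64-ccmp");
  if (c.enableEarlyIfCvt)
    passes.push_back("early-ifcvt");
  return passes;
}

int main() {
  PipelineConfig cfg;
  cfg.enableCCMP = false; // analogous to -aarch64-enable-ccmp=false
  for (const std::string &p : buildILPOpts(cfg))
    std::printf("%s\n", p.c_str()); // aarch64-condopt, early-ifcvt
}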
if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) { addPass(createAArch64AdvSIMDScalar()); // The AdvSIMD pass may produce copies that can be rewritten to // be register coaleascer friendly. addPass(&PeepholeOptimizerID); } } void AArch64PassConfig::addPostRegAlloc() { // Remove redundant copy instructions. if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination) addPass(createAArch64RedundantCopyEliminationPass()); if (TM->getOptLevel() != CodeGenOpt::None && usingDefaultRegAlloc()) // Improve performance for some FP/SIMD code for A57. addPass(createAArch64A57FPLoadBalancing()); } void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); } // The AArch64SpeculationHardeningPass destroys dominator tree and natural // loop info, which is needed for the FalkorHWPFFixPass and also later on. // Therefore, run the AArch64SpeculationHardeningPass before the // FalkorHWPFFixPass to avoid recomputing dominator tree and natural loop // info. addPass(createAArch64SpeculationHardeningPass()); if (TM->getOptLevel() != CodeGenOpt::None) { if (EnableFalkorHWPFFix) addPass(createFalkorHWPFFixPass()); } } void AArch64PassConfig::addPreEmitPass() { // Machine Block Placement might have created new opportunities when run // at O3, where the Tail Duplication Threshold is set to 4 instructions. // Run the load/store optimizer once more. if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); if (EnableA53Fix835769) addPass(createAArch64A53Fix835769()); // Relax conditional branch instructions if they're otherwise out of // range of their destination. if (BranchRelaxation) addPass(&BranchRelaxationPassID); if (EnableBranchTargets) addPass(createAArch64BranchTargetsPass()); if (TM->getOptLevel() != CodeGenOpt::None && EnableCompressJumpTables) addPass(createAArch64CompressJumpTablesPass()); if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && TM->getTargetTriple().isOSBinFormatMachO()) addPass(createAArch64CollectLOHPass()); } Index: vendor/llvm/dist-release_80/lib/Target/WebAssembly/WebAssemblyInstrInteger.td =================================================================== --- vendor/llvm/dist-release_80/lib/Target/WebAssembly/WebAssemblyInstrInteger.td (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/WebAssembly/WebAssemblyInstrInteger.td (revision 344765) @@ -1,131 +1,124 @@ // WebAssemblyInstrInteger.td-WebAssembly Integer codegen -------*- tablegen -*- // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// /// \file /// WebAssembly Integer operand code-gen constructs. 
/// //===----------------------------------------------------------------------===// multiclass UnaryInt i32Inst, bits<32> i64Inst> { defm _I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins), [(set I32:$dst, (node I32:$src))], !strconcat("i32.", !strconcat(name, "\t$dst, $src")), !strconcat("i32.", name), i32Inst>; defm _I64 : I<(outs I64:$dst), (ins I64:$src), (outs), (ins), [(set I64:$dst, (node I64:$src))], !strconcat("i64.", !strconcat(name, "\t$dst, $src")), !strconcat("i64.", name), i64Inst>; } multiclass BinaryInt i32Inst, bits<32> i64Inst> { defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins), [(set I32:$dst, (node I32:$lhs, I32:$rhs))], !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), !strconcat("i32.", name), i32Inst>; defm _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins), [(set I64:$dst, (node I64:$lhs, I64:$rhs))], !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), !strconcat("i64.", name), i64Inst>; } multiclass ComparisonInt i32Inst, bits<32> i64Inst> { defm _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), (outs), (ins), [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))], !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs")), !strconcat("i32.", name), i32Inst>; defm _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), (outs), (ins), [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))], !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs")), !strconcat("i64.", name), i64Inst>; } // The spaces after the names are for aesthetic purposes only, to make // operands line up vertically after tab expansion. let isCommutable = 1 in defm ADD : BinaryInt; defm SUB : BinaryInt; let isCommutable = 1 in defm MUL : BinaryInt; // Divide and remainder trap on a zero denominator. let hasSideEffects = 1 in { defm DIV_S : BinaryInt; defm DIV_U : BinaryInt; defm REM_S : BinaryInt; defm REM_U : BinaryInt; } // hasSideEffects = 1 let isCommutable = 1 in { defm AND : BinaryInt; defm OR : BinaryInt; defm XOR : BinaryInt; } // isCommutable = 1 defm SHL : BinaryInt; defm SHR_S : BinaryInt; defm SHR_U : BinaryInt; defm ROTL : BinaryInt; defm ROTR : BinaryInt; let isCommutable = 1 in { defm EQ : ComparisonInt; defm NE : ComparisonInt; } // isCommutable = 1 defm LT_S : ComparisonInt; defm LT_U : ComparisonInt; defm GT_S : ComparisonInt; defm GT_U : ComparisonInt; defm LE_S : ComparisonInt; defm LE_U : ComparisonInt; defm GE_S : ComparisonInt; defm GE_U : ComparisonInt; defm CLZ : UnaryInt; defm CTZ : UnaryInt; defm POPCNT : UnaryInt; defm EQZ_I32 : I<(outs I32:$dst), (ins I32:$src), (outs), (ins), [(set I32:$dst, (setcc I32:$src, 0, SETEQ))], "i32.eqz \t$dst, $src", "i32.eqz", 0x45>; defm EQZ_I64 : I<(outs I32:$dst), (ins I64:$src), (outs), (ins), [(set I32:$dst, (setcc I64:$src, 0, SETEQ))], "i64.eqz \t$dst, $src", "i64.eqz", 0x50>; // Optimize away an explicit mask on a rotate count. 
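The patterns that follow drop an explicit mask of the rotate count because an N-bit rotate only ever looks at the low log2(N) bits of the count, so masking with N-1 is a no-op. A quick standalone check of that identity for 32-bit rotates (plain C++, independent of the TableGen definitions):

#include <cassert>
#include <cstdint>

// Branch-free 32-bit rotate-left; well-defined for any count value.
static uint32_t rotl32(uint32_t x, uint32_t n) {
  n &= 31;
  return (x << n) | (x >> ((32 - n) & 31));
}

int main() {
  for (uint32_t n = 0; n < 256; ++n)
    // Masking the count with 31 first never changes the result, which is why
    // (rotl x, (and n, 31)) can be selected as a plain rotl instruction.
    assert(rotl32(0x12345678u, n & 31) == rotl32(0x12345678u, n));
}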
def : Pat<(rotl I32:$lhs, (and I32:$rhs, 31)), (ROTL_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotr I32:$lhs, (and I32:$rhs, 31)), (ROTR_I32 I32:$lhs, I32:$rhs)>; def : Pat<(rotl I64:$lhs, (and I64:$rhs, 63)), (ROTL_I64 I64:$lhs, I64:$rhs)>; def : Pat<(rotr I64:$lhs, (and I64:$rhs, 63)), (ROTR_I64 I64:$lhs, I64:$rhs)>; defm SELECT_I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs, I32:$cond), (outs), (ins), [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], "i32.select\t$dst, $lhs, $rhs, $cond", "i32.select", 0x1b>; defm SELECT_I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs, I32:$cond), (outs), (ins), [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], "i64.select\t$dst, $lhs, $rhs, $cond", "i64.select", 0x1b>; // ISD::SELECT requires its operand to conform to getBooleanContents, but // WebAssembly's select interprets any non-zero value as true, so we can fold // a setne with 0 into a select. def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs), (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>; def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs), (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>; // And again, this time with seteq instead of setne and the arms reversed. def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs), (SELECT_I32 I32:$rhs, I32:$lhs, I32:$cond)>; def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs), (SELECT_I64 I64:$rhs, I64:$lhs, I32:$cond)>; - -// The legalizer inserts an unnecessary `and 1` to make input conform -// to getBooleanContents, which we can lower away. -def : Pat<(select (i32 (and I32:$cond, 1)), I32:$lhs, I32:$rhs), - (SELECT_I32 I32:$lhs, I32:$rhs, I32:$cond)>; -def : Pat<(select (i32 (and I32:$cond, 1)), I64:$lhs, I64:$rhs), - (SELECT_I64 I64:$lhs, I64:$rhs, I32:$cond)>; Index: vendor/llvm/dist-release_80/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- vendor/llvm/dist-release_80/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/X86/X86ISelDAGToDAG.cpp (revision 344765) @@ -1,4103 +1,4111 @@ //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines a DAG pattern matching instruction selector for X86, // converting from a legalized dag to a X86 dag. 
// //===----------------------------------------------------------------------===// #include "X86.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); static cl::opt AndImmShrink("x86-and-imm-shrink", cl::init(true), cl::desc("Enable setting constant bits to reduce size of mask immediates"), cl::Hidden); //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// namespace { /// This corresponds to X86AddressMode, but uses SDValue's instead of register /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, FrameIndexBase } BaseType; // This is really a union, discriminated by BaseType! SDValue Base_Reg; int Base_FrameIndex; unsigned Scale; SDValue IndexReg; int32_t Disp; SDValue Segment; const GlobalValue *GV; const Constant *CP; const BlockAddress *BlockAddr; const char *ES; MCSymbol *MCSym; int JT; unsigned Align; // CP alignment. unsigned char SymbolFlags; // X86II::MO_* X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || MCSym != nullptr || JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { return BaseType == FrameIndexBase || IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } /// Return true if this addressing mode is already RIP-relative. 
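X86ISelAddressMode above mirrors the hardware memory-operand form Segment:[Base + Index*Scale + Disp], where Scale can only be 1, 2, 4 or 8 and Disp is a signed 32-bit field even in 64-bit mode. A small sketch of that shape, evaluating an effective address from plain integers; it leaves out the symbolic-displacement variants (GV, CP, JT, ...).

#include <cassert>
#include <cstdint>

struct AddressMode {
  uint64_t base = 0;
  uint64_t index = 0;
  unsigned scale = 1; // must be 1, 2, 4 or 8
  int32_t disp = 0;   // displacement is always a signed 32-bit field
};

static uint64_t effectiveAddress(const AddressMode &am) {
  assert(am.scale == 1 || am.scale == 2 || am.scale == 4 || am.scale == 8);
  return am.base + am.index * am.scale + (int64_t)am.disp;
}

int main() {
  // Something like  movl 16(%rdi,%rcx,4), %eax  with %rdi=0x1000, %rcx=3.
  AddressMode am;
  am.base = 0x1000;
  am.index = 3;
  am.scale = 4;
  am.disp = 16;
  assert(effectiveAddress(am) == 0x1000 + 3 * 4 + 16);
}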
bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = dyn_cast_or_null(Base_Reg.getNode())) return RegNode->getReg() == X86::RIP; return false; } void setBaseReg(SDValue Reg) { BaseType = RegBase; Base_Reg = Reg; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(SelectionDAG *DAG = nullptr) { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; if (Base_Reg.getNode()) Base_Reg.getNode()->dump(DAG); else dbgs() << "nul\n"; if (BaseType == FrameIndexBase) dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; dbgs() << " Scale " << Scale << '\n' << "IndexReg "; if (IndexReg.getNode()) IndexReg.getNode()->dump(DAG); else dbgs() << "nul\n"; dbgs() << " Disp " << Disp << '\n' << "GV "; if (GV) GV->dump(); else dbgs() << "nul"; dbgs() << " CP "; if (CP) CP->dump(); else dbgs() << "nul"; dbgs() << '\n' << "ES "; if (ES) dbgs() << ES; else dbgs() << "nul"; dbgs() << " MCSym "; if (MCSym) dbgs() << MCSym; else dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Align << '\n'; } #endif }; } namespace { //===--------------------------------------------------------------------===// /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; /// If true, selector should try to optimize for code size instead of /// performance. bool OptForSize; /// If true, selector should try to optimize for minimum code size. bool OptForMinSize; /// Disable direct TLS access through segment registers. bool IndirectTlsSegRefs; public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), OptForSize(false), OptForMinSize(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. Subtarget = &MF.getSubtarget(); IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( "indirect-tls-seg-refs"); SelectionDAGISel::runOnMachineFunction(MF); return true; } void EmitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; void PreprocessISelDAG() override; void PostprocessISelDAG() override; // Include the pieces autogenerated from the target description. 
#include "X86GenDAGISel.inc" private: void Select(SDNode *N) override; bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); bool selectRelocImm(SDValue N, SDValue &Op); bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? CurDAG->getTargetFrameIndex( AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())) : AM.Base_Reg; Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. 
if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp, AM.SymbolFlags); else if (AM.ES) { assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); } else if (AM.MCSym) { assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); assert(AM.SymbolFlags == 0 && "oo"); Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); } else if (AM.JT != -1) { assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); } else if (AM.BlockAddr) Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, AM.SymbolFlags); else Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); if (AM.Segment.getNode()) Segment = AM.Segment; else Segment = CurDAG->getRegister(0, MVT::i32); } // Utility function to determine whether we should avoid selecting // immediate forms of instructions for better code size or not. // At a high level, we'd like to avoid such instructions when // we have similar constants used within the same basic block // that can be kept in a register. // bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { uint32_t UseCount = 0; // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. if (!OptForSize) return false; // Walk all the users of the immediate. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { SDNode *User = *UI; // This user is already selected. Count it as a legitimate use and // move on. if (User->isMachineOpcode()) { UseCount++; continue; } // We want to count stores of immediates as real uses. if (User->getOpcode() == ISD::STORE && User->getOperand(1).getNode() == N) { UseCount++; continue; } // We don't currently match users that have > 2 operands (except // for stores, which are handled above) // Those instruction won't match in ISEL, for now, and would // be counted incorrectly. // This may change in the future as we add additional instruction // types. if (User->getNumOperands() != 2) continue; // Immediates that are used for offsets as part of stack // manipulation should be left alone. These are typically // used to indicate SP offsets for argument passing and // will get pulled into stores/pushes (implicitly). if (User->getOpcode() == X86ISD::ADD || User->getOpcode() == ISD::ADD || User->getOpcode() == X86ISD::SUB || User->getOpcode() == ISD::SUB) { // Find the other operand of the add/sub. SDValue OtherOp = User->getOperand(0); if (OtherOp.getNode() == N) OtherOp = User->getOperand(1); // Don't count if the other operand is SP. RegisterSDNode *RegNode; if (OtherOp->getOpcode() == ISD::CopyFromReg && (RegNode = dyn_cast_or_null( OtherOp->getOperand(1).getNode()))) if ((RegNode->getReg() == X86::ESP) || (RegNode->getReg() == X86::RSP)) continue; } // ... otherwise, count this and move on. UseCount++; } // If we have more than 1 use, then recommend for hoisting. return (UseCount > 1); } /// Return a target constant with the specified value of type i8. inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } /// Return a target constant with the specified value, of type i32. 
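shouldAvoidImmediateInstFormsForSize above is essentially a use-count vote: when optimizing for size, if the same immediate feeds two or more instructions it is cheaper to materialize it once in a register than to repeat it in each encoding. A toy sketch of that decision; the real code additionally special-cases stores, stack-pointer adjustments, and already-selected users.

#include <cassert>
#include <cstdint>
#include <vector>

// Each "instruction" is modeled only by the immediate operands it consumes.
using Instruction = std::vector<int64_t>;

static bool shouldHoistImmediate(int64_t imm,
                                 const std::vector<Instruction> &block,
                                 bool optForSize) {
  if (!optForSize)
    return false;
  unsigned uses = 0;
  for (const Instruction &inst : block)
    for (int64_t op : inst)
      if (op == imm)
        ++uses;
  return uses > 1; // repeated immediate: keep it in a register instead
}

int main() {
  std::vector<Instruction> block = {{4}, {4}, {8}};
  assert(shouldHoistImmediate(4, block, /*optForSize=*/true));   // two users
  assert(!shouldHoistImmediate(8, block, /*optForSize=*/true));  // single user
  assert(!shouldHoistImmediate(4, block, /*optForSize=*/false)); // not -Os
}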
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } /// Return a target constant with the specified value, of type i64. inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i64); } SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(1); MVT VecVT = N->getOperand(0).getSimpleValueType(); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(2); MVT VecVT = N->getSimpleValueType(0); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. SDNode *getGlobalBaseReg(); /// Return a reference to the TargetMachine, casted to the target-specific /// type. const X86TargetMachine &getTargetMachine() const { return static_cast(TM); } /// Return a reference to the TargetInstrInfo, casted to the target-specific /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. bool ComplexPatternFuncMutatesDAG() const override { return true; } bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; /// Returns whether this is a relocatable immediate in the range /// [-2^Width .. 2^Width-1]. template bool isSExtRelocImm(SDNode *N) const { if (auto *CN = dyn_cast(N)) return isInt(CN->getSExtValue()); return isSExtAbsoluteSymbolRef(Width, N); } // Indicates we should prefer to use a non-temporal load for this load. bool useNonTemporalLoad(LoadSDNode *N) const { if (!N->isNonTemporal()) return false; unsigned StoreSize = N->getMemoryVT().getStoreSize(); if (N->getAlignment() < StoreSize) return false; switch (StoreSize) { default: llvm_unreachable("Unsupported store size"); case 4: case 8: return false; case 16: return Subtarget->hasSSE41(); case 32: return Subtarget->hasAVX2(); case 64: return Subtarget->hasAVX512(); } } bool foldLoadStoreIntoMemOperand(SDNode *Node); MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node, SDValue &InFlag); bool tryOptimizeRem8Extend(SDNode *N); bool onlyUsesZeroFlag(SDValue Flags) const; bool hasNoSignFlagUses(SDValue Flags) const; bool hasNoCarryFlagUses(SDValue Flags) const; }; } // Returns true if this masked compare can be implemented legally with this // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_RND || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. 
When // this happens we will use 512-bit operations and the mask will not be // zero extended. EVT OpVT = N->getOperand(0).getValueType(); if (OpVT.is256BitVector() || OpVT.is128BitVector()) return Subtarget->hasVLX(); return true; } // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || Opcode == X86ISD::FSETCCM_RND) return true; return false; } // Returns true if we can assume the writer of the mask has zero extended it // for us. bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { // If this is an AND, check if we have a compare on either side. As long as // one side guarantees the mask is zero extended, the AND will preserve those // zeros. if (N->getOpcode() == ISD::AND) return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); return isLegalMaskCompare(N, Subtarget); } bool X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (OptLevel == CodeGenOpt::None) return false; if (!N.hasOneUse()) return false; if (N.getOpcode() != ISD::LOAD) return true; // Don't fold non-temporal loads if we have an instruction for them. if (useNonTemporalLoad(cast(N))) return false; // If N is a load, do additional profitability checks. if (U == Root) { switch (U->getOpcode()) { default: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::SUB: case X86ISD::SBB: case X86ISD::AND: case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: case ISD::XOR: { SDValue Op1 = U->getOperand(1); // If the other operand is a 8-bit immediate we should fold the immediate // instead. This reduces code size. // e.g. // movl 4(%esp), %eax // addl $4, %eax // vs. // movl $4, %eax // addl 4(%esp), %eax // The former is 2 bytes shorter. In case where the increment is 1, then // the saving can be 4 bytes (by using incl %eax). if (ConstantSDNode *Imm = dyn_cast(Op1)) { if (Imm->getAPIntValue().isSignedIntN(8)) return false; // If this is a 64-bit AND with an immediate that fits in 32-bits, // prefer using the smaller and over folding the load. This is needed to // make sure immediates created by shrinkAndImmediate are always folded. // Ideally we would narrow the load during DAG combine and get the // best of both worlds. if (U->getOpcode() == ISD::AND && Imm->getAPIntValue().getBitWidth() == 64 && Imm->getAPIntValue().isIntN(32)) return false; } // If the other operand is a TLS address, we should fold it instead. // This produces // movl %gs:0, %eax // leal i@NTPOFF(%eax), %eax // instead of // movl $i@NTPOFF, %eax // addl %gs:0, %eax // if the block also has an access to a second TLS address this will save // a load. // FIXME: This is probably also true for non-TLS addresses. if (Op1.getOpcode() == X86ISD::Wrapper) { SDValue Val = Op1.getOperand(0); if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) return false; } // Don't fold load if this matches the BTS/BTR/BTC patterns. 
// BTS: (or X, (shl 1, n)) // BTR: (and X, (rotl -2, n)) // BTC: (xor X, (shl 1, n)) if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { if (U->getOperand(0).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(0).getOperand(0))) return false; if (U->getOperand(1).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(1).getOperand(0))) return false; } if (U->getOpcode() == ISD::AND) { SDValue U0 = U->getOperand(0); SDValue U1 = U->getOperand(1); if (U0.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U0.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } if (U1.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U1.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } } break; } case ISD::SHL: case ISD::SRA: case ISD::SRL: // Don't fold a load into a shift by immediate. The BMI2 instructions // support folding a load, but not an immediate. The legacy instructions // support folding an immediate, but can't fold a load. Folding an // immediate is preferable to folding a load. if (isa(U->getOperand(1))) return false; break; } } // Prevent folding a load if this can implemented with an insert_subreg or // a move that implicitly zeroes. if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && isNullConstant(Root->getOperand(2)) && (Root->getOperand(0).isUndef() || ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) return false; return true; } /// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector Ops; SDValue Chain = OrigChain.getOperand(0); if (Chain.getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else { assert(Chain.getOpcode() == ISD::TokenFactor && "Unexpected chain operand"); for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) if (Chain.getOperand(i).getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); Ops.clear(); Ops.push_back(NewChain); } Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); Ops.append(Call->op_begin() + 1, Call->op_end()); CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } /// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call /// chain and the load. static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { // The transformation is somewhat dangerous if the call's chain was glued to // the call. After MoveBelowOrigChain the load is moved between the call and // the chain, this can create a cycle if the load is not folded. So it is // *really* important that we are sure the load will be folded. if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) return false; LoadSDNode *LD = dyn_cast(Callee.getNode()); if (!LD || LD->isVolatile() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; // Now let's find the callseq_start. 
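// For a non-tail call the chain typically looks like
//   (load callee) --> ... --> CALLSEQ_START --> ... --> X86ISD::CALL,
// so walk up from the call's chain operand through single-use nodes until the
// CALLSEQ_START, then check below that the callee load's chain feeds it
// (possibly through a TokenFactor).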
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { if (!Chain.hasOneUse()) return false; Chain = Chain.getOperand(0); } if (!Chain.getNumOperands()) return false; // Since we are not checking for AA here, conservatively abort if the chain // writes to memory. It's not safe to move the callee (a load) across a store. if (isa(Chain.getNode()) && cast(Chain.getNode())->writeMem()) return false; if (Chain.getOperand(0).getNode() == Callee.getNode()) return true; if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && Callee.getValue(1).hasOneUse()) return true; return false; } void X86DAGToDAGISel::PreprocessISelDAG() { // OptFor[Min]Size are used in pattern predicates that isel is matching. OptForSize = MF->getFunction().optForSize(); OptForMinSize = MF->getFunction().optForMinSize(); assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. // If this is a target specific AND node with no flag usages, turn it back // into ISD::AND to enable test instruction matching. if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; CurDAG->DeleteNode(N); continue; } if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. !Subtarget->useRetpolineIndirectCalls() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. /// /// [Load chain] /// ^ /// | /// [Load] /// ^ ^ /// | | /// / \-- /// / | ///[CALLSEQ_START] | /// ^ | /// | | /// [LOAD/C2Reg] | /// | | /// \ / /// \ / /// [CALL] bool HasCallSeq = N->getOpcode() == X86ISD::CALL; SDValue Chain = N->getOperand(0); SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } // Lower fpround and fpextend nodes that target the FP stack to be store and // load to the stack. This is a gross hack. We would like to simply mark // these as being illegal, but when we do that, legalize produces these when // it expands calls, then expands these in the same legalize pass. We would // like dag combine to be able to hack on these between the call expansion // and the node legalization. As such this pass basically does "really // late" legalization of these inline with the X86 isel pass. // FIXME: This should only happen when not compiled with -O0. if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND) continue; MVT SrcVT = N->getOperand(0).getSimpleValueType(); MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) continue; // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. 
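// (For example, an f32 -> f64 fp_extend where both types are kept in XMM
// registers is selected to cvtss2sd directly and needs no stack temporary.)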
const X86TargetLowering *X86Lowering = static_cast(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; if (!SrcIsSSE && !DstIsSSE) { // If this is an FPStack extension, it is a noop. if (N->getOpcode() == ISD::FP_EXTEND) continue; // If this is a value-preserving FPStack truncation, it is a noop. if (N->getConstantOperandVal(1)) continue; } // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. MVT MemVT; if (N->getOpcode() == ISD::FP_ROUND) MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'. else MemVT = SrcIsSSE ? SrcVT : DstVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MachinePointerInfo(), MemVT); SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MachinePointerInfo(), MemVT); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because // anything below the conversion could be folded into other existing nodes. // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; CurDAG->DeleteNode(N); } } // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { unsigned Opc = N->getMachineOpcode(); if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && Opc != X86::MOVSX64rr8) return false; SDValue N0 = N->getOperand(0); // We need to be extracting the lower bit of an extend. if (!N0.isMachineOpcode() || N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || N0.getConstantOperandVal(1) != X86::sub_8bit) return false; // We're looking for either a movsx or movzx to match the original opcode. unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX : X86::MOVSX32rr8_NOREX; SDValue N00 = N0.getOperand(0); if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) return false; if (Opc == X86::MOVSX64rr8) { // If we had a sign extend from 8 to 64 bits. We still need to go from 32 // to 64. MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), MVT::i64, N00); ReplaceUses(N, Extend); } else { // Ok we can drop this extend and just use the original extend. ReplaceUses(N, N00.getNode()); } return true; } void X86DAGToDAGISel::PostprocessISelDAG() { // Skip peepholes at -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; if (tryOptimizeRem8Extend(N)) { MadeChange = true; continue; } // Look for a TESTrr+ANDrr pattern where both operands of the test are // the same. Rewrite to remove the AND. 
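// In assembly terms the rewrite is (register choice purely illustrative):
//   andl %esi, %edi
//   testl %edi, %edi    ->    testl %esi, %edi
// TEST performs the same AND and sets the same flags, so when the TEST is the
// AND's only user the separate AND instruction is redundant.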
unsigned Opc = N->getMachineOpcode(); if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || Opc == X86::TEST32rr || Opc == X86::TEST64rr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode()) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, Test); MadeChange = true; continue; } if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { unsigned NewOpc; switch (N0Opc) { case X86::AND8rm: NewOpc = X86::TEST8mr; break; case X86::AND16rm: NewOpc = X86::TEST16mr; break; case X86::AND32rm: NewOpc = X86::TEST32mr; break; case X86::AND64rm: NewOpc = X86::TEST64mr; break; } // Need to swap the memory and register operand. SDValue Ops[] = { And.getOperand(1), And.getOperand(2), And.getOperand(3), And.getOperand(4), And.getOperand(5), And.getOperand(0), And.getOperand(6) /* Chain */ }; MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops); ReplaceUses(N, Test); MadeChange = true; continue; } } // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is // used. We're doing this late so we can prefer to fold the AND into masked // comparisons. Doing that can be better for the live range of the mask // register. if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode() && onlyUsesZeroFlag(SDValue(N, 0))) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other // KAND instructions and KTEST use the same ISA feature. if (N0Opc == X86::KANDBrr || (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { unsigned NewOpc; switch (Opc) { default: llvm_unreachable("Unexpected opcode!"); case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; } MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, KTest); MadeChange = true; continue; } } // Attempt to remove vectors moves that were inserted to zero upper bits. if (Opc != TargetOpcode::SUBREG_TO_REG) continue; unsigned SubRegIdx = N->getConstantOperandVal(2); if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) continue; SDValue Move = N->getOperand(1); if (!Move.isMachineOpcode()) continue; // Make sure its one of the move opcodes we recognize. 
switch (Move.getMachineOpcode()) { default: continue; case X86::VMOVAPDrr: case X86::VMOVUPDrr: case X86::VMOVAPSrr: case X86::VMOVUPSrr: case X86::VMOVDQArr: case X86::VMOVDQUrr: case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: break; } SDValue In = Move.getOperand(0); if (!In.isMachineOpcode() || In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) continue; // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers // the SHA instructions which use a legacy encoding. uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; if ((TSFlags & X86II::EncodingMask) != X86II::VEX && (TSFlags & X86II::EncodingMask) != X86II::EVEX && (TSFlags & X86II::EncodingMask) != X86II::XOP) continue; // Producing instruction is another vector instruction. We can drop the // move. CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); MadeChange = true; } if (MadeChange) CurDAG->RemoveDeadNodes(); } /// Emit any code that needs to be executed only in the main function. void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); TargetLowering::CallLoweringInfo CLI(*CurDAG); CLI.setChain(CurDAG->getRoot()) .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), std::move(Args)); const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); std::pair Result = TLI.LowerCallTo(CLI); CurDAG->setRoot(Result.second); } } void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. const Function &F = MF->getFunction(); if (F.hasExternalLinkage() && F.getName() == "main") emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { // On 64-bit platforms, we can run into an issue where a frame index // includes a displacement that, when added to the explicit displacement, // will overflow the displacement field. Assuming that the frame index // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. return isInt<31>(Val); } bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // If there's no offset to fold, we don't need to do any work. if (Offset == 0) return false; // Cannot combine ExternalSymbol displacements with integer offsets. if (AM.ES || AM.MCSym) return true; int64_t Val = AM.Disp + Offset; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { if (!X86::isOffsetSuitableForCodeModel(Val, M, AM.hasSymbolicDisplacement())) return true; // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. 
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && !isDispSafeForFrameIndex(Val)) return true; } AM.Disp = Val; return false; } bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. // load fs:0 -> FS segment register. // // This optimization is valid because the GNU TLS model defines that // gs:0 (or fs:0 on X86-64) contains its own address. // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast(Address)) if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && !IndirectTlsSegRefs && (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) switch (N->getPointerInfo().getAddrSpace()) { case 256: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); return false; case 257: AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); return false; // Address space 258 is not handled here, because it is not used to // address TLS areas. } return true; } /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing /// mode. These wrap things that will resolve down into a symbol reference. /// If no match is possible, this returns true, otherwise it returns false. bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. if (AM.hasSymbolicDisplacement()) return true; + bool IsRIPRelTLS = false; bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; + if (IsRIPRel) { + SDValue Val = N.getOperand(0); + if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) + IsRIPRelTLS = true; + } - // We can't use an addressing mode in the 64-bit large code model. In the - // medium code model, we use can use an mode when RIP wrappers are present. - // That signifies access to globals that are known to be "near", such as the - // GOT itself. + // We can't use an addressing mode in the 64-bit large code model. + // Global TLS addressing is an exception. In the medium code model, + // we use can use a mode when RIP wrappers are present. + // That signifies access to globals that are known to be "near", + // such as the GOT itself. CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit() && - (M == CodeModel::Large || (M == CodeModel::Medium && !IsRIPRel))) + ((M == CodeModel::Large && !IsRIPRelTLS) || + (M == CodeModel::Medium && !IsRIPRel))) return true; // Base and index reg must be 0 in order to use %rip as base. if (IsRIPRel && AM.hasBaseOrIndexReg()) return true; // Make a local copy in case we can't do this fold. 
X86ISelAddressMode Backup = AM; int64_t Offset = 0; SDValue N0 = N.getOperand(0); if (GlobalAddressSDNode *G = dyn_cast(N0)) { AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); Offset = G->getOffset(); } else if (ConstantPoolSDNode *CP = dyn_cast(N0)) { AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); AM.SymbolFlags = CP->getTargetFlags(); Offset = CP->getOffset(); } else if (ExternalSymbolSDNode *S = dyn_cast(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); } else if (auto *S = dyn_cast(N0)) { AM.MCSym = S->getMCSymbol(); } else if (JumpTableSDNode *J = dyn_cast(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); } else if (BlockAddressSDNode *BA = dyn_cast(N0)) { AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); Offset = BA->getOffset(); } else llvm_unreachable("Unhandled symbol reference node."); if (foldOffsetIntoAddress(Offset, AM)) { AM = Backup; return true; } if (IsRIPRel) AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); // Commit the changes now that we know this fold is safe. return false; } /// Add the specified node to the specified addressing mode, returning true if /// it cannot be done. This just pattern matches for the addressing mode. bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has // a smaller encoding and avoids a scaled-index. if (AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. // TODO: Which other code models can use this? if (TM.getCodeModel() == CodeModel::Small && Subtarget->is64Bit() && AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); return false; } bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); X86ISelAddressMode Backup = AM; if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) return false; AM = Backup; // Try again after commuting the operands. if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) return false; AM = Backup; // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least // the add. if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { N = Handle.getValue(); AM.Base_Reg = N.getOperand(0); AM.IndexReg = N.getOperand(1); AM.Scale = 1; return false; } N = Handle.getValue(); return true; } // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. 
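// The fold* helpers below rely on this by inserting each node they create
// immediately before the node being replaced, in creation order, so the
// topological ordering stays valid without a re-sort.
//
// As a small sanity check of the first such rewrite (foldMaskAndShiftToExtract
// turns "(X >> (8-C1)) & (0xff << C1)" into "((X >> 8) & 0xff) << C1"), the
// identity holds bit-for-bit for C1 in [1,3]; e.g. with a sample value and
// C1 == 2:
static_assert((((0xabcdu >> (8 - 2)) & (0xffu << 2)) ==
               (((0xabcdu >> 8) & 0xffu) << 2)),
              "h-register extract rewrite sample");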
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N->getNodeId() == -1 || (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { DAG.RepositionNode(Pos->getIterator(), N.getNode()); // Mark Node as invalid for pruning as after this it may be a successor to a // selected node but otherwise be in the same position of Pos. // Conservatively mark it with the same -abs(Id) to assure node id // invariant is preserved. N->setNodeId(Pos->getNodeId()); SelectionDAGISel::InvalidateNodeId(N.getNode()); } } // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse()) return true; int ScaleLog = 8 - Shift.getConstantOperandVal(1); if (ScaleLog <= 0 || ScaleLog >= 4 || Mask != (0xffu << ScaleLog)) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue Eight = DAG.getConstant(8, DL, MVT::i8); SDValue NewMask = DAG.getConstant(0xff, DL, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, Eight); insertDAGNode(DAG, N, Srl); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, And); insertDAGNode(DAG, N, ShlCount); insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; } // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SHL || !isa(Shift.getOperand(1))) return true; // Not likely to be profitable if either the AND or SHIFT node has more // than one use (unless all uses are for address computation). Besides, // isel mechanism requires their node ids to be reused. if (!N.hasOneUse() || !Shift.hasOneUse()) return true; // Verify that the shift amount is something we can fold. unsigned ShiftAmt = Shift.getConstantOperandVal(1); if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. 
There is no // hierarchy left to express. insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); AM.Scale = 1 << ShiftAmt; AM.IndexReg = NewAnd; return false; } // Implement some heroics to detect shifts of masked values where the mask can // be replaced by extending the shift and undoing that in the addressing mode // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in // the addressing mode. This results in code such as: // // int f(short *y, int *lookup_table) { // ... // return *y + lookup_table[*y >> 11]; // } // // Turning into: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $11, %ecx // addl (%rsi,%rcx,4), %eax // // Instead of: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $9, %ecx // andl $124, %rcx // addl (%rsi,%rcx), %eax // // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || !isa(Shift.getOperand(1))) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); unsigned MaskLZ = countLeadingZeros(Mask); unsigned MaskTZ = countTrailingZeros(Mask); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. unsigned AMShiftAmt = MaskTZ; // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; if (MaskLZ < ScaleDown) return true; MaskLZ -= ScaleDown; // The final check is to ensure that any masked out high bits of X are // already known to be zero. Otherwise, the mask has a semantic impact // other than masking out a couple of low bits. Unfortunately, because of // the mask, zero extensions will be removed from operands in some cases. // This code works extra hard to look through extensions because we can // replace them with zero extensions cheaply if necessary. bool ReplacingAnyExtend = false; if (X.getOpcode() == ISD::ANY_EXTEND) { unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - X.getOperand(0).getSimpleValueType().getSizeInBits(); // Assume that we'll replace the any-extend with a zero-extend, and // narrow the search to the extended value. X = X.getOperand(0); MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; ReplacingAnyExtend = true; } APInt MaskedHighBits = APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); KnownBits Known = DAG.computeKnownBits(X); if (MaskedHighBits != Known.Zero) return true; // We've identified a pattern that can be transformed into a single shift // and an addressing mode. Make it so. MVT VT = N.getSimpleValueType(); if (ReplacingAnyExtend) { assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. 
SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewSRL; return false; } // Transform "(X >> SHIFT) & (MASK << C1)" to // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be // matched to a BEXTR later. Returns false if the simplification is performed. static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM, const X86Subtarget &Subtarget) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse() || !N.hasOneUse()) return true; // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) return true; // We need to ensure that mask is a continuous run of bits. if (!isShiftedMask_64(Mask)) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. unsigned AMShiftAmt = countTrailingZeros(Mask); // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewAnd; return false; } bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); LLVM_DEBUG({ dbgs() << "MatchAddress: "; AM.dump(CurDAG); }); // Limit recursion. if (Depth > 5) return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. 
// RIP relative addressing: %rip + 32-bit displacement! if (AM.isRIPRelative()) { // FIXME: JumpTable and ExternalSymbol address currently don't like // displacements. It isn't very important, but this should be fixed for // consistency. if (!(AM.ES || AM.MCSym) && AM.JT != -1) return true; if (ConstantSDNode *Cst = dyn_cast(N)) if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } switch (N.getOpcode()) { default: break; case ISD::LOCAL_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) if (const auto *ESNode = dyn_cast(N.getOperand(0))) { // Use the symbol and don't prefix it. AM.MCSym = ESNode->getMCSymbol(); return false; } break; } case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: if (!matchLoadInAddress(cast(N), AM)) return false; break; case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; } break; case ISD::SHL: if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) { unsigned Val = CN->getZExtValue(); // Note that we handle x<<1 as (,x,2) rather than (x,x) here so // that the base operand remains free for further matching. If // the base doesn't end up getting used, a post-processing step // in MatchAddress turns (,x,2) into (x,x), which is cheaper. if (Val == 1 || Val == 2 || Val == 3) { AM.Scale = 1 << Val; SDValue ShVal = N.getOperand(0); // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (CurDAG->isBaseWithConstantOffset(ShVal)) { AM.IndexReg = ShVal.getOperand(0); ConstantSDNode *AddVal = cast(ShVal.getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!foldOffsetIntoAddress(Disp, AM)) return false; } AM.IndexReg = ShVal; return false; } } break; case ISD::SRL: { // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; SDValue X = And.getOperand(0); // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. if (X.getSimpleValueType().getSizeInBits() > 64) break; // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing // it down. if (!isa(N.getOperand(1)) || !isa(And.getOperand(1))) break; uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); // Try to fold the mask and shift into the scale, and return false if we // succeed. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: // A mul_lohi where we need the low part can be folded as a plain multiply. 
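// The MUL handling just below rewrites X*[3,5,9] as X+X*[2,4,8] so it can be
// matched as base + index*scale; e.g. x*5 becomes something like
//   leal (%rdi,%rdi,4), %eax
// (register choice illustrative).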
if (N.getResNo() != 0) break; LLVM_FALLTHROUGH; case ISD::MUL: case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr) { if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || CN->getZExtValue() == 9) { AM.Scale = unsigned(CN->getZExtValue())-1; SDValue MulVal = N.getOperand(0); SDValue Reg; // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && isa(MulVal.getOperand(1))) { Reg = MulVal.getOperand(0); ConstantSDNode *AddVal = cast(MulVal.getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getOperand(0); } else { Reg = N.getOperand(0); } AM.IndexReg = AM.Base_Reg = Reg; return false; } } break; case ISD::SUB: { // Given A-B, if A can be completely folded into the address and // the index field with the index field unused, use -B as the index. // This is a win if a has multiple parts that can be folded into // the address. Also, this saves a mov if the base register has // other uses, since it avoids a two-address sub instruction, however // it costs an additional mov if the index register has other uses. // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { AM = Backup; break; } // Test if the index field is free for use. if (AM.IndexReg.getNode() || AM.isRIPRelative()) { AM = Backup; break; } int Cost = 0; SDValue RHS = Handle.getValue().getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. if (!RHS.getNode()->hasOneUse() || RHS.getNode()->getOpcode() == ISD::CopyFromReg || RHS.getNode()->getOpcode() == ISD::TRUNCATE || RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && RHS.getOperand(0).getValueType() == MVT::i32)) ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. // FIXME: Don't rely on DELETED_NODEs. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && AM.Base_Reg->getOpcode() != ISD::DELETED_NODE && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; // If the folded LHS was interesting, this transformation saves // address arithmetic. if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + ((AM.Disp != 0) && (Backup.Disp == 0)) + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) --Cost; // If it doesn't look like it may be an overall win, don't do it. if (Cost >= 0) { AM = Backup; break; } // Ok, the transformation is legal and appears profitable. Go for it. SDValue Zero = CurDAG->getConstant(0, dl, N.getValueType()); SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS); AM.IndexReg = Neg; AM.Scale = 1; // Insert the new nodes into the topological ordering. 
insertDAGNode(*CurDAG, Handle.getValue(), Zero); insertDAGNode(*CurDAG, Handle.getValue(), Neg); return false; } case ISD::ADD: if (!matchAdd(N, AM, Depth)) return false; break; case ISD::OR: // We want to look through a transform in InstCombine and DAGCombiner that // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) // An 'lea' can then be used to match the shift (multiply) and add: // and $1, %esi // lea (%rsi, %rdi, 8), %rax if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && !matchAdd(N, AM, Depth)) return false; break; case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; SDValue Shift = N.getOperand(0); if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break; SDValue X = Shift.getOperand(0); // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. if (X.getSimpleValueType().getSizeInBits() > 64) break; if (!isa(N.getOperand(1))) break; uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift into BEXTR and scale. if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) return false; break; } } return matchAddressBase(N, AM); } /// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. if (!AM.IndexReg.getNode()) { AM.IndexReg = N; AM.Scale = 1; return false; } // Otherwise, we cannot select it. return true; } // Default, generate it as a register. AM.BaseType = X86ISelAddressMode::RegBase; AM.Base_Reg = N; return false; } /// Helper for selectVectorAddr. Handles things that can be folded into a /// gather scatter address. The index register and scale should have already /// been handled. bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { // TODO: Support other operations. switch (N.getOpcode()) { case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: if (!matchWrapper(N, AM)) return false; break; } return matchAddressBase(N, AM); } bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; auto *Mgs = cast(Parent); AM.IndexReg = Mgs->getIndex(); AM.Scale = cast(Mgs->getScale())->getZExtValue(); unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. 
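// (These address-space numbers are the LLVM x86 convention for
// segment-relative memory; e.g. a pointer of IR type "i8 addrspace(256)*" is
// a %gs-relative address, which is exactly the mapping applied below.)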
if (AddrSpace == 256) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == 257) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); // Try to match into the base and displacement fields. if (matchVectorAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); if (AM.BaseType == X86ISelAddressMode::RegBase) { if (!AM.Base_Reg.getNode()) AM.Base_Reg = CurDAG->getRegister(0, VT); } getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; if (Parent && // This list of opcodes are all the nodes that have an "addr:$ptr" operand // that are not a MemSDNode, and thus don't have proper addrspace info. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores Parent->getOpcode() != X86ISD::TLSCALL && // Fixme Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. if (AddrSpace == 256) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == 257) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } if (matchAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); if (AM.BaseType == X86ISelAddressMode::RegBase) { if (!AM.Base_Reg.getNode()) AM.Base_Reg = CurDAG->getRegister(0, VT); } if (!AM.IndexReg.getNode()) AM.IndexReg = CurDAG->getRegister(0, VT); getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } // We can only fold a load if all nodes between it and the root node have a // single use. If there are additional uses, we could end up duplicating the // load. static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) { while (User != Root) { if (!User->hasOneUse()) return false; User = *User->use_begin(); } return true; } /// Match a scalar SSE load. In particular, we want to match a load whose top /// elements are either undef or zeros. The load flavor is derived from the /// type of N, which is either v4f32 or v2f64. /// /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &PatternNodeWithChain) { if (!hasSingleUsesFromRoot(Root, Parent)) return false; // We can allow a full vector load here since narrowing a load is ok. 
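// (Narrowing is fine because the scalar instruction reads only the low
// f32/f64 element of the folded memory operand, so it never touches more
// memory than the original vector load did.)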
if (ISD::isNON_EXTLoad(N.getNode())) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { LoadSDNode *LD = cast(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); } } // We can also match the special zero extended load opcode. if (N.getOpcode() == X86ISD::VZEXT_LOAD) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) { auto *MI = cast(PatternNodeWithChain); return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, Segment); } } // Need to make sure that the SCALAR_TO_VECTOR and load are both only used // once. Otherwise the load might get duplicated and the chain output of the // duplicate load will not be observed by all dependencies. if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) { PatternNodeWithChain = N.getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); } } // Also handle the case where we explicitly require zeros in the top // elements. This is a vector shuffle from the zero vector. if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() && // Check to see if the top elements are all zeros (or bitcast of zeros). N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && N.getOperand(0).getNode()->hasOneUse()) { PatternNodeWithChain = N.getOperand(0).getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. LoadSDNode *LD = cast(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); } } return false; } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast(N)) { uint64_t ImmVal = CN->getZExtValue(); if (!isUInt<32>(ImmVal)) return false; Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); return true; } // In static codegen with small code model, we can get the address of a label // into a register with 'movl' if (N->getOpcode() != X86ISD::Wrapper) return false; N = N.getOperand(0); // At least GNU as does not accept 'movl' for TPOFF relocations. // FIXME: We could use 'movl' when we know we are targeting MC. if (N->getOpcode() == ISD::TargetGlobalTLSAddress) return false; Imm = N; if (N->getOpcode() != ISD::TargetGlobalAddress) return TM.getCodeModel() == CodeModel::Small; Optional CR = cast(N)->getGlobal()->getAbsoluteSymbolRange(); if (!CR) return TM.getCodeModel() == CodeModel::Small; return CR->getUnsignedMax().ult(1ull << 32); } bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { // Save the debug loc before calling selectLEAAddr, in case it invalidates N. 
SDLoc DL(N); if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; RegisterSDNode *RN = dyn_cast(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); else if (Base.getValueType() == MVT::i32 && !dyn_cast(Base)) { // Base could already be %rip, particularly in the x32 ABI. Base = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, CurDAG->getTargetConstant(0, DL, MVT::i64), Base, CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), 0); } RN = dyn_cast(Index); if (RN && RN->getReg() == 0) Index = CurDAG->getRegister(0, MVT::i64); else { assert(Index.getValueType() == MVT::i32 && "Expect to be extending 32-bit registers for use in LEA"); Index = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, DL, MVT::i64, CurDAG->getTargetConstant(0, DL, MVT::i64), Index, CurDAG->getTargetConstant(X86::sub_32bit, DL, MVT::i32)), 0); } return true; } /// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; // Save the DL and VT before calling matchAddress, it can invalidate N. SDLoc DL(N); MVT VT = N.getSimpleValueType(); // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support // segments. SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; unsigned Complexity = 0; if (AM.BaseType == X86ISelAddressMode::RegBase) if (AM.Base_Reg.getNode()) Complexity = 1; else AM.Base_Reg = CurDAG->getRegister(0, VT); else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Complexity = 4; if (AM.IndexReg.getNode()) Complexity++; else AM.IndexReg = CurDAG->getRegister(0, VT); // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with // a simple shift. if (AM.Scale > 1) Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else Complexity += 2; } if (AM.Disp && (AM.Base_Reg.getNode() || AM.IndexReg.getNode())) Complexity++; // If it isn't worth using an LEA, reject it. if (Complexity <= 2) return false; getAddressOperands(AM, DL, Base, Scale, Index, Disp, Segment); return true; } /// This is only run on TargetGlobalTLSAddress nodes. 
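/// For the 32-bit general-dynamic sequence the operand is typically emitted as
/// "leal sym@tlsgd(,%ebx,1), %eax" (GOT base in %ebx, no base register), which
/// is why the i32 path below pins the index register to EBX.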
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast(N); X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); AM.SymbolFlags = GA->getTargetFlags(); if (N.getValueType() == MVT::i32) { AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); } else { AM.IndexReg = CurDAG->getRegister(0, MVT::i64); } getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); return true; } bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { if (auto *CN = dyn_cast(N)) { Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN), N.getValueType()); return true; } // Keep track of the original value type and whether this value was // truncated. If we see a truncation from pointer type to VT that truncates // bits that are known to be zero, we can use a narrow reference. EVT VT = N.getValueType(); bool WasTruncated = false; if (N.getOpcode() == ISD::TRUNCATE) { WasTruncated = true; N = N.getOperand(0); } if (N.getOpcode() != X86ISD::Wrapper) return false; // We can only use non-GlobalValues as immediates if they were not truncated, // as we do not have any range information. If we have a GlobalValue and the // address was not truncated, we can select it as an operand directly. unsigned Opc = N.getOperand(0)->getOpcode(); if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { Op = N.getOperand(0); // We can only select the operand directly if we didn't have to look past a // truncate. return !WasTruncated; } // Check that the global's range fits into VT. auto *GA = cast(N.getOperand(0)); Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) return false; // Okay, we can use a narrow reference. Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, GA->getOffset(), GA->getTargetFlags()); return true; } bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. 
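/// (In 32-bit PIC code this is the register that the PIC/global base reg setup
/// initializes, e.g. with the address of _GLOBAL_OFFSET_TABLE_ on ELF targets,
/// so GOT- and GOTOFF-style references can be formed from it.)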
SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { if (N->getOpcode() == ISD::TRUNCATE) N = N->getOperand(0).getNode(); if (N->getOpcode() != X86ISD::Wrapper) return false; auto *GA = dyn_cast(N->getOperand(0)); if (!GA) return false; Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); return CR && CR->getSignedMin().sge(-1ull << Width) && CR->getSignedMax().slt(1ull << Width); } static X86::CondCode getCondFromOpc(unsigned Opc) { X86::CondCode CC = X86::COND_INVALID; if (CC == X86::COND_INVALID) CC = X86::getCondFromBranchOpc(Opc); if (CC == X86::COND_INVALID) CC = X86::getCondFromSETOpc(Opc); if (CC == X86::COND_INVALID) CC = X86::getCondFromCMovOpc(Opc); return CC; } /// Test whether the given X86ISD::CMP node has any users that use a flag /// other than ZF. bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); switch (CC) { // Comparisons which only use the zero flag. case X86::COND_E: case X86::COND_NE: continue; // Anything else: assume conservatively. default: return false; } } } return true; } /// Test whether the given X86ISD::CMP node has any uses which require the SF /// flag to be accurate. bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); switch (CC) { // Comparisons which don't examine the SF flag. case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_NE: case X86::COND_O: case X86::COND_NO: case X86::COND_P: case X86::COND_NP: continue; // Anything else: assume conservatively. default: return false; } } } return true; } static bool mayUseCarryFlag(X86::CondCode CC) { switch (CC) { // Comparisons which don't examine the CF flag. 
case X86::COND_O: case X86::COND_NO: case X86::COND_E: case X86::COND_NE: case X86::COND_S: case X86::COND_NS: case X86::COND_P: case X86::COND_NP: case X86::COND_L: case X86::COND_GE: case X86::COND_G: case X86::COND_LE: return false; // Anything else: assume conservatively. default: return true; } } /// Test whether the given node which sets flags has any uses which require the /// CF flag to be accurate. bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; unsigned UIOpc = UI->getOpcode(); if (UIOpc == ISD::CopyToReg) { // Only examine CopyToReg uses that copy to EFLAGS. if (cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromOpc(FlagUI->getMachineOpcode()); if (mayUseCarryFlag(CC)) return false; } // This CopyToReg is ok. Move on to the next user. continue; } // This might be an unselected node. So look for the pre-isel opcodes that // use flags. unsigned CCOpNo; switch (UIOpc) { default: // Something unusual. Be conservative. return false; case X86ISD::SETCC: CCOpNo = 0; break; case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::CMOV: CCOpNo = 2; break; case X86ISD::BRCOND: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); if (mayUseCarryFlag(CC)) return false; } return true; } /// Check whether or not the chain ending in StoreNode is suitable for doing /// the {load; op; store} to modify transformation. static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, SDValue StoredVal, SelectionDAG *CurDAG, unsigned LoadOpNo, LoadSDNode *&LoadNode, SDValue &InputChain) { // Is the stored value result 0 of the operation? if (StoredVal.getResNo() != 0) return false; // Are there other uses of the operation other than the store? if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; // Is the store non-extending and non-indexed? if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) return false; SDValue Load = StoredVal->getOperand(LoadOpNo); // Is the stored value a non-extending and non-indexed load? if (!ISD::isNormalLoad(Load.getNode())) return false; // Return LoadNode by reference. LoadNode = cast(Load); // Is store the only read of the loaded value? if (!Load.hasOneUse()) return false; // Is the address of the store the same as the load? if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || LoadNode->getOffset() != StoreNode->getOffset()) return false; bool FoundLoad = false; SmallVector ChainOps; SmallVector LoopWorklist; SmallPtrSet Visited; const unsigned int Max = 1024; // Visualization of Load-Op-Store fusion: // ------------------------- // Legend: // *-lines = Chain operand dependencies. // |-lines = Normal operand dependencies. // Dependencies flow down and right. n-suffix references multiple nodes. 
// // C Xn C // * * * // * * * // Xn A-LD Yn TF Yn // * * \ | * | // * * \ | * | // * * \ | => A--LD_OP_ST // * * \| \ // TF OP \ // * | \ Zn // * | \ // A-ST Zn // // This merge induced dependences from: #1: Xn -> LD, OP, Zn // #2: Yn -> LD // #3: ST -> Zn // Ensure the transform is safe by checking for the dual // dependencies to make sure we do not induce a loop. // As LD is a predecessor to both OP and ST we can do this by checking: // a). if LD is a predecessor to a member of Xn or Yn. // b). if a Zn is a predecessor to ST. // However, (b) can only occur through being a chain predecessor to // ST, which is the same as Zn being a member or predecessor of Xn, // which is a subset of LD being a predecessor of Xn. So it's // subsumed by check (a). SDValue Chain = StoreNode->getChain(); // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { FoundLoad = true; ChainOps.push_back(Load.getOperand(0)); } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { FoundLoad = true; // Drop Load, but keep its chain. No cycle check necessary. ChainOps.push_back(Load.getOperand(0)); continue; } LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } } if (!FoundLoad) return false; // Worklist is currently Xn. Add Yn to worklist. for (SDValue Op : StoredVal->ops()) if (Op.getNode() != LoadNode) LoopWorklist.push_back(Op.getNode()); // Check (a) if Load is a predecessor to Xn + Yn if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, true)) return false; InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); return true; } // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its // address are suitable. // // The tablegen pattern memory operand pattern is currently not able to match // the case where the EFLAGS on the original operation are used. // // To move this to tablegen, we'll need to improve tablegen to allow flags to // be transferred from a node in the pattern to the result node, probably with // a new keyword. For example, we have this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (implicit EFLAGS)]>; // but maybe need something like this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (transferrable EFLAGS)]>; // // Until then, we manually fold these and instruction select the operation // here. bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { StoreSDNode *StoreNode = cast(Node); SDValue StoredVal = StoreNode->getOperand(1); unsigned Opc = StoredVal->getOpcode(); // Before we try to select anything, make sure this is memory operand size // and opcode we can handle. Note that this must match the code below that // actually lowers the opcodes. 
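// For illustration (a sketch of the shape this routine looks for), an i64
// increment of a memory location:
//   t0: i64 = load [addr]
//   t1: i64,flags = X86ISD::ADD t0, 1
//   store t1, [addr]
// is rewritten below into a single read-modify-write instruction such as
// INC64m addr (or ADD64mi8 addr, 1 when INC is not preferred), provided the
// loaded value has no other uses and folding does not create a chain cycle.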
EVT MemVT = StoreNode->getMemoryVT(); if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && MemVT != MVT::i8) return false; bool IsCommutable = false; switch (Opc) { default: return false; case X86ISD::SUB: case X86ISD::SBB: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: IsCommutable = true; break; } unsigned LoadOpNo = 0; LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) { if (!IsCommutable) return false; // This operation is commutable, try the other operand. LoadOpNo = 1; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) return false; } SDValue Base, Scale, Index, Disp, Segment; if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, unsigned Opc8) { switch (MemVT.getSimpleVT().SimpleTy) { case MVT::i64: return Opc64; case MVT::i32: return Opc32; case MVT::i16: return Opc16; case MVT::i8: return Opc8; default: llvm_unreachable("Invalid size!"); } }; MachineSDNode *Result; switch (Opc) { case X86ISD::ADD: case X86ISD::SUB: // Try to match inc/dec. if (!Subtarget->slowIncDec() || CurDAG->getMachineFunction().getFunction().optForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { unsigned NewOpc = ((Opc == X86ISD::ADD) == IsOne) ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); break; } } LLVM_FALLTHROUGH; case X86ISD::ADC: case X86ISD::SBB: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: { auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, X86::ADD8mr); case X86ISD::ADC: return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, X86::ADC8mr); case X86ISD::SUB: return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, X86::SUB8mr); case X86ISD::SBB: return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, X86::SBB8mr); case X86ISD::AND: return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, X86::AND8mr); case X86ISD::OR: return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); case X86ISD::XOR: return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, X86::XOR8mr); default: llvm_unreachable("Invalid opcode!"); } }; auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); case X86ISD::AND: return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); case X86ISD::OR: return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); default: 
llvm_unreachable("Invalid opcode!"); } }; auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, X86::ADD8mi); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, X86::ADC8mi); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, X86::SUB8mi); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, X86::SBB8mi); case X86ISD::AND: return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, X86::AND8mi); case X86ISD::OR: return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, X86::OR8mi); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, X86::XOR8mi); default: llvm_unreachable("Invalid opcode!"); } }; unsigned NewOpc = SelectRegOpcode(Opc); SDValue Operand = StoredVal->getOperand(1-LoadOpNo); // See if the operand is a constant that we can fold into an immediate // operand. if (auto *OperandC = dyn_cast(Operand)) { auto OperandV = OperandC->getAPIntValue(); // Check if we can shrink the operand enough to fit in an immediate (or // fit into a smaller immediate) by negating it and switching the // operation. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && (-OperandV).getMinSignedBits() <= 8) || (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && (-OperandV).getMinSignedBits() <= 32)) && hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; } // First try to fit this into an Imm8 operand. If it doesn't fit, then try // the larger immediate operand. if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImm8Opcode(Opc); } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } } if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { SDValue CopyTo = CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, StoredVal.getOperand(2), SDValue()); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, CopyTo, CopyTo.getValue(1)}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } else { const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } break; } default: llvm_unreachable("Invalid opcode!"); } MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), LoadNode->getMemOperand()}; CurDAG->setNodeMemRefs(Result, MemOps); // Update Load Chain uses as well. ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); return true; } // See if this is an X & Mask that we can match to BEXTR/BZHI. // Where Mask is one of the following patterns: // a) x & (1 << nbits) - 1 // b) x & ~(-1 << nbits) // c) x & (-1 >> (32 - y)) // d) x << (32 - y) >> (32 - y) bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { assert( (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && "Should be either an and-mask, or right-shift after clearing high bits."); // BEXTR is BMI instruction, BZHI is BMI2 instruction. 
We need at least one. if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) return false; MVT NVT = Node->getSimpleValueType(0); // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return false; unsigned Size = NVT.getSizeInBits(); SDValue NBits; // If we have BMI2's BZHI, we are ok with muti-use patterns. // Else, if we only have BMI1's BEXTR, we require one-use. const bool CanHaveExtraUses = Subtarget->hasBMI2(); auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { return CanHaveExtraUses || Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); }; auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; // a) x & ((1 << nbits) + (-1)) auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) if (!isAllOnesConstant(Mask->getOperand(1))) return false; // Match `1 << nbits`. Must only have one use! SDValue M0 = Mask->getOperand(0); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // b) x & ~(-1 << nbits) auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! if (!isBitwiseNot(Mask) || !checkOneUse(Mask)) return false; // Match `-1 << nbits`. Must only have one use! SDValue M0 = Mask->getOperand(0); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isAllOnesConstant(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // Match potentially-truncated (bitwidth - y) auto matchShiftAmt = [checkOneUse, Size, &NBits](SDValue ShiftAmt) { // Skip over a truncate of the shift amount. if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { ShiftAmt = ShiftAmt.getOperand(0); // The trunc should have been the only user of the real shift amount. if (!checkOneUse(ShiftAmt)) return false; } // Match the shift amount as: (bitwidth - y). It should go away, too. if (ShiftAmt.getOpcode() != ISD::SUB) return false; auto V0 = dyn_cast(ShiftAmt.getOperand(0)); if (!V0 || V0->getZExtValue() != Size) return false; NBits = ShiftAmt.getOperand(1); return true; }; // c) x & (-1 >> (32 - y)) auto matchPatternC = [&checkOneUse, matchShiftAmt](SDValue Mask) -> bool { // Match `l>>`. Must only have one use! if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) return false; // We should be shifting all-ones constant. if (!isAllOnesConstant(Mask.getOperand(0))) return false; SDValue M1 = Mask.getOperand(1); // The shift amount should not be used externally. if (!checkOneUse(M1)) return false; return matchShiftAmt(M1); }; SDValue X; // d) x << (32 - y) >> (32 - y) auto matchPatternD = [&checkOneUse, &checkTwoUse, matchShiftAmt, &X](SDNode *Node) -> bool { if (Node->getOpcode() != ISD::SRL) return false; SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) return false; SDValue N1 = Node->getOperand(1); SDValue N01 = N0->getOperand(1); // Both of the shifts must be by the exact same value. // There should not be any uses of the shift amount outside of the pattern. 
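// For example, with Size == 32 and y == 8, pattern (d) is
//   (x << (32 - 8)) l>> (32 - 8)  ==  (x << 24) l>> 24
// which isolates the low 8 bits of x, i.e. the same result BZHI/BEXTR
// produce for nbits == 8.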
if (N1 != N01 || !checkTwoUse(N1)) return false; if (!matchShiftAmt(N1)) return false; X = N0->getOperand(0); return true; }; auto matchLowBitMask = [&matchPatternA, &matchPatternB, &matchPatternC](SDValue Mask) -> bool { // FIXME: pattern c. return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); }; if (Node->getOpcode() == ISD::AND) { X = Node->getOperand(0); SDValue Mask = Node->getOperand(1); if (matchLowBitMask(Mask)) { // Great. } else { std::swap(X, Mask); if (!matchLowBitMask(Mask)) return false; } } else if (!matchPatternD(Node)) return false; SDLoc DL(Node); // If we do *NOT* have BMI2, let's find out if the if the 'X' is *logically* // shifted (potentially with one-use trunc inbetween), // and if so look past one-use truncation. MVT XVT = NVT; if (!Subtarget->hasBMI2() && X.getOpcode() == ISD::TRUNCATE && X.hasOneUse() && X.getOperand(0).getOpcode() == ISD::SRL) { assert(NVT == MVT::i32 && "Expected target valuetype to be i32"); X = X.getOperand(0); XVT = X.getSimpleValueType(); assert(XVT == MVT::i64 && "Expected truncation from i64"); } SDValue OrigNBits = NBits; if (NBits.getValueType() != XVT) { // Truncate the shift amount. NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits); insertDAGNode(*CurDAG, OrigNBits, NBits); // Insert 8-bit NBits into lowest 8 bits of XVT-sized (32 or 64-bit) // register. All the other bits are undefined, we do not care about them. SDValue ImplDef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, XVT), 0); insertDAGNode(*CurDAG, OrigNBits, ImplDef); NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, XVT, ImplDef, NBits); insertDAGNode(*CurDAG, OrigNBits, NBits); } if (Subtarget->hasBMI2()) { // Great, just emit the the BZHI.. SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, XVT, X, NBits); ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } // Else, emitting BEXTR requires one more step. // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location // [ bit count][ shift] name // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 // Shift NBits left by 8 bits, thus producing 'control'. // This makes the low 8 bits to be zero. SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8); SDValue Control = CurDAG->getNode(ISD::SHL, DL, XVT, NBits, C8); insertDAGNode(*CurDAG, OrigNBits, Control); // If the 'X' is *logically* shifted, we can fold that shift into 'control'. if (X.getOpcode() == ISD::SRL) { SDValue ShiftAmt = X.getOperand(1); X = X.getOperand(0); assert(ShiftAmt.getValueType() == MVT::i8 && "Expected shift amount to be i8"); // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero! SDValue OrigShiftAmt = ShiftAmt; ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, XVT, ShiftAmt); insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt); // And now 'or' these low 8 bits of shift amount into the 'control'. Control = CurDAG->getNode(ISD::OR, DL, XVT, Control, ShiftAmt); insertDAGNode(*CurDAG, OrigNBits, Control); } // And finally, form the BEXTR itself. SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control); // The 'X' was originally truncated. Do that now. if (XVT != NVT) { insertDAGNode(*CurDAG, OrigNBits, Extract); Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); } ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. 
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); SDLoc dl(Node); SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // If we have TBM we can use an immediate for the control. If we have BMI // we should only do this if the BEXTR instruction is implemented well. // Otherwise moving the control into a register makes this more costly. // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? if (!Subtarget->hasTBM() && !(Subtarget->hasBMI() && Subtarget->hasFastBEXTR())) return nullptr; // Must have a shift right. if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) return nullptr; // Shift can't have additional users. if (!N0->hasOneUse()) return nullptr; // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return nullptr; // Shift amount and RHS of and must be constant. ConstantSDNode *MaskCst = dyn_cast(N1); ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); if (!MaskCst || !ShiftCst) return nullptr; // And RHS must be a mask. uint64_t Mask = MaskCst->getZExtValue(); if (!isMask_64(Mask)) return nullptr; uint64_t Shift = ShiftCst->getZExtValue(); uint64_t MaskSize = countPopulation(Mask); // Don't interfere with something that can be handled by extracting AH. // TODO: If we are able to fold a load, BEXTR might still be better than AH. if (Shift == 8 && MaskSize == 8) return nullptr; // Make sure we are only using bits that were in the original value, not // shifted in. if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; // BMI requires the immediate to placed in a register. if (!Subtarget->hasTBM()) { ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0); } MachineSDNode *NewNode; SDValue Input = N0->getOperand(0); SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(Input.getValue(1), SDValue(NewNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Input)->getMemOperand()}); } else { NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New); } return NewNode; } // Emit a PCMISTR(I/M) instruction. MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node) { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); SDValue Imm = Node->getOperand(2); const ConstantInt *Val = cast(Imm)->getConstantIntValue(); Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); // Try to fold a load. No need to check alignment. 
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, N1.getOperand(0) }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); return CNode; } SDValue Ops[] = { N0, N1, Imm }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32); MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); return CNode; } // Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need // to emit a second instruction after this one. This is needed since we have two // copyToReg nodes glued before this and we need to continue that glue through. MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node, SDValue &InFlag) { SDValue N0 = Node->getOperand(0); SDValue N2 = Node->getOperand(2); SDValue Imm = Node->getOperand(4); const ConstantInt *Val = cast(Imm)->getConstantIntValue(); Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType()); // Try to fold a load. No need to check alignment. SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, N2.getOperand(0), InFlag }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); InFlag = SDValue(CNode, 3); // Update the chain. ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N2)->getMemOperand()}); return CNode; } SDValue Ops[] = { N0, N2, Imm, InFlag }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); InFlag = SDValue(CNode, 2); return CNode; } bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { EVT VT = N->getValueType(0); // Only handle scalar shifts. if (VT.isVector()) return false; // Narrower shifts only mask to 5 bits in hardware. unsigned Size = VT == MVT::i64 ? 64 : 32; SDValue OrigShiftAmt = N->getOperand(1); SDValue ShiftAmt = OrigShiftAmt; SDLoc DL(N); // Skip over a truncate of the shift amount. if (ShiftAmt->getOpcode() == ISD::TRUNCATE) ShiftAmt = ShiftAmt->getOperand(0); // This function is called after X86DAGToDAGISel::matchBitExtract(), // so we are not afraid that we might mess up BZHI/BEXTR pattern. SDValue NewShiftAmt; if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { SDValue Add0 = ShiftAmt->getOperand(0); SDValue Add1 = ShiftAmt->getOperand(1); // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X // to avoid the ADD/SUB. if (isa(Add1) && cast(Add1)->getZExtValue() % Size == 0) { NewShiftAmt = Add0; // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to // generate a NEG instead of a SUB of a constant. } else if (ShiftAmt->getOpcode() == ISD::SUB && isa(Add0) && cast(Add0)->getZExtValue() != 0 && cast(Add0)->getZExtValue() % Size == 0) { // Insert a negate op. // TODO: This isn't guaranteed to replace the sub if there is a logic cone // that uses it that's not a shift. 
EVT SubVT = ShiftAmt.getValueType(); SDValue Zero = CurDAG->getConstant(0, DL, SubVT); SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); NewShiftAmt = Neg; // Insert these operands into a valid topological order so they can // get selected independently. insertDAGNode(*CurDAG, OrigShiftAmt, Zero); insertDAGNode(*CurDAG, OrigShiftAmt, Neg); } else return false; } else return false; if (NewShiftAmt.getValueType() != MVT::i8) { // Need to truncate the shift amount. NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); // Add to a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); } // Insert a new mask to keep the shift amount legal. This should be removed // by isel patterns. NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, CurDAG->getConstant(Size - 1, DL, MVT::i8)); // Place in a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt); if (UpdatedNode != N) { // If we found an existing node, we should replace ourselves with that node // and wait for it to be selected after its other users. ReplaceNode(N, UpdatedNode); return true; } // If the original shift amount is now dead, delete it so that we don't run // it through isel. if (OrigShiftAmt.getNode()->use_empty()) CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); // Now that we've optimized the shift amount, defer to normal isel to get // load folding and legacy vs BMI2 selection without repeating it here. SelectCode(N); return true; } /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large /// positive one. This reverses a transform in SimplifyDemandedBits that /// shrinks mask constants by clearing bits. There is also a possibility that /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that /// case, just replace the 'and'. Return 'true' if the node is replaced. bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't // have immediate operands. MVT VT = And->getSimpleValueType(0); if (VT != MVT::i32 && VT != MVT::i64) return false; auto *And1C = dyn_cast(And->getOperand(1)); if (!And1C) return false; // Bail out if the mask constant is already negative. It's can't shrink more. // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel // patterns to use a 32-bit and instead of a 64-bit and by relying on the // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits // are negative too. APInt MaskVal = And1C->getAPIntValue(); unsigned MaskLZ = MaskVal.countLeadingZeros(); if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) return false; // Don't extend into the upper 32 bits of a 64 bit mask. if (VT == MVT::i64 && MaskLZ >= 32) { MaskLZ -= 32; MaskVal = MaskVal.trunc(32); } SDValue And0 = And->getOperand(0); APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); APInt NegMaskVal = MaskVal | HighZeros; // If a negative constant would not allow a smaller encoding, there's no need // to continue. Only change the constant when we know it's a win. unsigned MinWidth = NegMaskVal.getMinSignedBits(); if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) return false; // Extend masks if we truncated above. 
if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { NegMaskVal = NegMaskVal.zext(64); HighZeros = HighZeros.zext(64); } // The variable operand must be all zeros in the top bits to allow using the // new, negative constant as the mask. if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) return false; // Check if the mask is -1. In that case, this is an unnecessary instruction // that escaped earlier analysis. if (NegMaskVal.isAllOnesValue()) { ReplaceNode(And, And0.getNode()); return true; } // A negative mask allows a smaller encoding. Create a new 'and' node. SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); ReplaceNode(And, NewAnd.getNode()); SelectCode(NewAnd.getNode()); return true; } void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); SDLoc dl(Node); if (Node->isMachineOpcode()) { LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); return; // Already selected. } switch (Opcode) { default: break; case ISD::BRIND: { if (Subtarget->isTargetNaCl()) // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We // leave the instruction alone. break; if (Subtarget->isTarget64BitILP32()) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. This is needed because x86-64 can do many things, but jmp %r32 // ain't one of them. const SDValue &Target = Node->getOperand(1); assert(Target.getSimpleValueType() == llvm::MVT::i32); SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, Node->getOperand(0), ZextTarget); ReplaceNode(Node, Brind.getNode()); SelectCode(ZextTarget.getNode()); SelectCode(Brind.getNode()); return; } break; } case X86ISD::GlobalBaseReg: ReplaceNode(Node, getGlobalBaseReg()); return; case ISD::BITCAST: // Just drop all 128/256/512-bit bitcasts. if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() || NVT == MVT::f128) { ReplaceUses(SDValue(Node, 0), Node->getOperand(0)); CurDAG->RemoveDeadNode(Node); return; } break; case X86ISD::BLENDV: { // BLENDV selects like a regular VSELECT. SDValue VSelect = CurDAG->getNode( ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2)); ReplaceNode(Node, VSelect.getNode()); SelectCode(VSelect.getNode()); // We already called ReplaceUses. return; } case ISD::SRL: if (matchBitExtract(Node)) return; LLVM_FALLTHROUGH; case ISD::SRA: case ISD::SHL: if (tryShiftAmountMod(Node)) return; break; case ISD::AND: if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); return; } if (matchBitExtract(Node)) return; if (AndImmShrink && shrinkAndImmediate(Node)) return; LLVM_FALLTHROUGH; case ISD::OR: case ISD::XOR: { // For operations of the form (x << C1) op C2, check if we can use a smaller // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); if (N0->getOpcode() != ISD::SHL || !N0->hasOneUse()) break; // i8 is unshrinkable, i16 should be promoted to i32. 
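// For example, (x << 8) | 0x1000 can be rebuilt as ((x | 0x10) << 8): the
// constant 0x10 fits in a sign-extended imm8 (OR32ri8) while 0x1000 needs a
// full imm32, so the encoding shrinks. The bits cleared by the shift must
// not be set in the constant for OR/XOR, which the RemovedBitsMask check
// below enforces.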
if (NVT != MVT::i32 && NVT != MVT::i64) break; ConstantSDNode *Cst = dyn_cast(N1); ConstantSDNode *ShlCst = dyn_cast(N0->getOperand(1)); if (!Cst || !ShlCst) break; int64_t Val = Cst->getSExtValue(); uint64_t ShlVal = ShlCst->getZExtValue(); // Make sure that we don't change the operation by removing bits. // This only matters for OR and XOR, AND is unaffected. uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1; if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0) break; unsigned ShlOp, AddOp, Op; MVT CstVT = NVT; // Check the minimum bitwidth for the new constant. // TODO: AND32ri is the same as AND64ri32 with zext imm. // TODO: MOV32ri+OR64r is cheaper than MOV64ri64+OR64rr // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32. if (!isInt<8>(Val) && isInt<8>(Val >> ShlVal)) CstVT = MVT::i8; else if (!isInt<32>(Val) && isInt<32>(Val >> ShlVal)) CstVT = MVT::i32; // Bail if there is no smaller encoding. if (NVT == CstVT) break; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: assert(CstVT == MVT::i8); ShlOp = X86::SHL32ri; AddOp = X86::ADD32rr; switch (Opcode) { default: llvm_unreachable("Impossible opcode"); case ISD::AND: Op = X86::AND32ri8; break; case ISD::OR: Op = X86::OR32ri8; break; case ISD::XOR: Op = X86::XOR32ri8; break; } break; case MVT::i64: assert(CstVT == MVT::i8 || CstVT == MVT::i32); ShlOp = X86::SHL64ri; AddOp = X86::ADD64rr; switch (Opcode) { default: llvm_unreachable("Impossible opcode"); case ISD::AND: Op = CstVT==MVT::i8? X86::AND64ri8 : X86::AND64ri32; break; case ISD::OR: Op = CstVT==MVT::i8? X86::OR64ri8 : X86::OR64ri32; break; case ISD::XOR: Op = CstVT==MVT::i8? X86::XOR64ri8 : X86::XOR64ri32; break; } break; } // Emit the smaller op and the shift. SDValue NewCst = CurDAG->getTargetConstant(Val >> ShlVal, dl, CstVT); SDNode *New = CurDAG->getMachineNode(Op, dl, NVT, N0->getOperand(0),NewCst); if (ShlVal == 1) CurDAG->SelectNodeTo(Node, AddOp, NVT, SDValue(New, 0), SDValue(New, 0)); else CurDAG->SelectNodeTo(Node, ShlOp, NVT, SDValue(New, 0), getI8Imm(ShlVal, dl)); return; } case X86ISD::SMUL: // i16/i32/i64 are handled with isel patterns. if (NVT != MVT::i8) break; LLVM_FALLTHROUGH; case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned LoReg, ROpc, MOpc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; break; case MVT::i16: LoReg = X86::AX; ROpc = X86::MUL16r; MOpc = X86::MUL16m; break; case MVT::i32: LoReg = X86::EAX; ROpc = X86::MUL32r; MOpc = X86::MUL32m; break; case MVT::i64: LoReg = X86::RAX; ROpc = X86::MUL64r; MOpc = X86::MUL64m; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. if (!FoldedLoad) { FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); MachineSDNode *CNode; if (FoldedLoad) { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. 
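// For example, MUL32r implicitly defines both EAX (the low half) and EDX
// (the high half) as well as EFLAGS; in this X86ISD::UMUL/SMUL case only the
// low half and the flags are consumed, so the high result is simply left
// dead.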
SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); } ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); CurDAG->RemoveDeadNode(Node); return; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned Opc, MOpc; bool isSigned = Opcode == ISD::SMUL_LOHI; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break; case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break; case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break; } } unsigned SrcReg, LoReg, HiReg; switch (Opc) { default: llvm_unreachable("Unknown MUL opcode!"); case X86::IMUL32r: case X86::MUL32r: SrcReg = LoReg = X86::EAX; HiReg = X86::EDX; break; case X86::IMUL64r: case X86::MUL64r: SrcReg = LoReg = X86::RAX; HiReg = X86::RDX; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. if (!foldedLoad) { foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg, N0, SDValue()).getValue(1); if (foldedLoad) { SDValue Chain; MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); Chain = SDValue(CNode, 0); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; SDVTList VTs = CurDAG->getVTList(MVT::Glue); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); InFlag = SDValue(CNode, 0); } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { assert(LoReg && "Register for low half is not defined!"); SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = ResLo.getValue(2); ReplaceUses(SDValue(Node, 0), ResLo); LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. 
if (!SDValue(Node, 1).use_empty()) { assert(HiReg && "Register for high half is not defined!"); SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = ResHi.getValue(2); ReplaceUses(SDValue(Node, 1), ResHi); LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned Opc, MOpc; bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break; case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break; case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break; case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break; case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break; case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break; case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break; } } unsigned LoReg, HiReg, ClrReg; unsigned SExtOpcode; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; SExtOpcode = X86::CBW; break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; ClrReg = X86::DX; SExtOpcode = X86::CWD; break; case MVT::i32: LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; SExtOpcode = X86::CDQ; break; case MVT::i64: LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; SExtOpcode = X86::CQO; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast(N0)->getMemOperand()}); } else { Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0); Chain = CurDAG->getEntryNode(); } Chain = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); if (isSigned && !signBitIsZero) { // Sign extend the low part into the high part. InFlag = SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. 
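// For example, an unsigned 32-bit divide reads its dividend from EDX:EAX, so
// materializing zero in EDX (MOV32r0, which expands to "xor %edx, %edx")
// makes the i32 value already copied into EAX a correctly zero-extended
// 64-bit dividend for DIV32r.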
SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, CurDAG->getTargetConstant(X86::sub_16bit, dl, MVT::i32)), 0); break; case MVT::i32: break; case MVT::i64: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 0); break; default: llvm_unreachable("Unexpected division source"); } InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, ClrNode, InFlag).getValue(1); } } if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by explicitly copying it to // an ABCD_L register. // // The current assumption of the register allocator is that isel // won't generate explicit references to the GR8_ABCD_H registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); unsigned AHExtOpcode = isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, MVT::Glue, AHCopy, InFlag); SDValue Result(RNode, 0); InFlag = SDValue(RNode, 1); Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the remainder (high) result, if it is needed. if (!SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case X86ISD::CMP: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // Optimizations for TEST compares. if (!isNullConstant(N1)) break; // Save the original VT of the compare. MVT CmpVT = N0.getSimpleValueType(); // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed // by a test instruction. The test should be removed later by // analyzeCompare if we are using only the zero flag. // TODO: Should we check the users and use the BEXTR flags directly? if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) { unsigned TestOpc = CmpVT == MVT::i64 ? 
X86::TEST64rr : X86::TEST32rr; SDValue BEXTR = SDValue(NewNode, 0); NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); return; } } // We can peek through truncates, but we need to be careful below. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8) { ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C) break; uint64_t Mask = C->getZExtValue(); // Check if we can replace AND+IMM64 with a shift. This is possible for // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero // flag. if (CmpVT == MVT::i64 && !isInt<32>(Mask) && onlyUsesZeroFlag(SDValue(Node, 0))) { if (isMask_64(~Mask)) { unsigned TrailingZeros = countTrailingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } if (isMask_64(Mask)) { unsigned LeadingZeros = countLeadingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } } MVT VT; int SubRegOp; unsigned ROpc, MOpc; // For each of these checks we need to be careful if the sign flag is // being used. It is only safe to use the sign flag in two conditions, // either the sign bit in the shrunken mask is zero or the final test // size is equal to the original compare size. if (isUInt<8>(Mask) && (!(Mask & 0x80) || CmpVT == MVT::i8 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, convert "testl %eax, $8" to "testb %al, $8" VT = MVT::i8; SubRegOp = X86::sub_8bit; ROpc = X86::TEST8ri; MOpc = X86::TEST8mi; } else if (OptForMinSize && isUInt<16>(Mask) && (!(Mask & 0x8000) || CmpVT == MVT::i16 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testl %eax, $32776" to "testw %ax, $32776". // NOTE: We only want to form TESTW instructions if optimizing for // min size. Otherwise we only save one byte and possibly get a length // changing prefix penalty in the decoders. VT = MVT::i16; SubRegOp = X86::sub_16bit; ROpc = X86::TEST16ri; MOpc = X86::TEST16mi; } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && ((!(Mask & 0x80000000) && // Without minsize 16-bit Cmps can get here so we need to // be sure we calculate the correct sign flag if needed. (CmpVT != MVT::i16 || !(Mask & 0x8000))) || CmpVT == MVT::i32 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testq %rax, $268468232" to "testl %eax, $268468232". // NOTE: We only want to run that transform if N0 is 32 or 64 bits. // Otherwize, we find ourselves in a position where we have to do // promotion. If previous passes did not promote the and, we assume // they had a good reason not to and do not promote here. VT = MVT::i32; SubRegOp = X86::sub_32bit; ROpc = X86::TEST32ri; MOpc = X86::TEST32mi; } else { // No eligible transformation was found. break; } // FIXME: We should be able to fold loads here. 
SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); SDValue Reg = N0.getOperand(0); // Emit a testl or testw. MachineSDNode *NewNode; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Reg.getOperand(0) }; NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); // Update the chain. ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Reg)->getMemOperand()}); } else { // Extract the subregister if necessary. if (N0.getValueType() != VT) Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm); } // Replace CMP with TEST. ReplaceNode(Node, NewNode); return; } break; } case X86ISD::PCMPISTR: { if (!Subtarget->hasSSE42()) break; bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::PCMPESTR: { if (!Subtarget->hasSSE42()) break; // Copy the two implicit register inputs. SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, Node->getOperand(1), SDValue()).getValue(1); InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, Node->getOperand(3), InFlag).getValue(1); bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InFlag); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. 
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; break; } SelectCode(Node); } bool X86DAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; switch (ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_i: // FIXME: It seems strange that 'i' is needed here since it's supposed to // be an immediate and not a memory constraint. LLVM_FALLTHROUGH; case InlineAsm::Constraint_o: // offsetable ?? case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } OutOps.push_back(Op0); OutOps.push_back(Op1); OutOps.push_back(Op2); OutOps.push_back(Op3); OutOps.push_back(Op4); return false; } /// This pass converts a legalized DAG into a X86-specific DAG, /// ready for instruction scheduling. FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); } Index: vendor/llvm/dist-release_80/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- vendor/llvm/dist-release_80/lib/Target/X86/X86ISelLowering.cpp (revision 344764) +++ vendor/llvm/dist-release_80/lib/Target/X86/X86ISelLowering.cpp (revision 344765) @@ -1,42747 +1,42750 @@ //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. 
// //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt ExperimentalPrefLoopAlignment( "x86-experimental-pref-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments " "(the last x86-experimental-pref-loop-alignment bits" " of the loop header PC will be 0)."), cl::Hidden); static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden); /// Call this when the user attempts to do something unsupported, like /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike /// report_fatal_error, so calling code should attempt to recover without /// crashing. static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. 
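// For example, SSE compares such as PCMPEQD/PCMPGTD write 0xFFFFFFFF into
// each lane where the predicate holds and 0 elsewhere, which is why vector
// booleans are declared as zero-or-negative-one below.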
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget.isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget.is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides and use cheaper ones. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget.hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) addBypassSlowDiv(64, 32); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget.isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget.isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget.is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Integer absolute. if (Subtarget.hasCMov()) { setOperationAction(ISD::ABS , MVT::i16 , Custom); setOperationAction(ISD::ABS , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::ABS , MVT::i64 , Custom); } // Funnel shifts. for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { setOperationAction(ShiftOp , MVT::i16 , Custom); setOperationAction(ShiftOp , MVT::i32 , Custom); if (Subtarget.is64Bit()) setOperationAction(ShiftOp , MVT::i64 , Custom); } // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. 
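  // As a scalar sketch of what that promotion buys (illustrative lambda only,
  // never called by the lowering below; the name is made up): a small unsigned
  // value zero-extended into a wider signed type is always non-negative, so
  // the signed convert that x86 does have gives the same result an unsigned
  // convert would.
  auto UIntToFPViaWiderSInt = [](unsigned short X) -> float {
    int Wide = static_cast<int>(X);  // zero-extends, so Wide is never negative
    return static_cast<float>(Wide); // plain signed int->float convert
  };
  (void)UIntToFPViaWiderSInt;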
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget.useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // SSE has no i16 to fp conversion, only i32. if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget.useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget.is64Bit()) { if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget.useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget.hasSSE1() && !Subtarget.hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. 
the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } else if (!Subtarget.is64Bit()) setOperationAction(ISD::BITCAST , MVT::i64 , Custom); // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::BR_CC, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); } if (Subtarget.is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); if (!Subtarget.hasBMI()) { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTTZ , MVT::i64 , Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); } } if (Subtarget.hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. 
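  // A scalar sketch of that i8 -> i32 promotion (illustrative lambda only,
  // never called here, assuming the GCC/Clang __builtin_clz intrinsic): count
  // leading zeros in 32 bits, then subtract the 24 extra high bits that the
  // widening introduced.
  auto CountLeadingZeros8 = [](unsigned char X) -> unsigned {
    // __builtin_clz is undefined for 0, so handle that case explicitly.
    return X ? __builtin_clz(static_cast<unsigned>(X)) - 24u : 8u;
  };
  (void)CountLeadingZeros8;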
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget.hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); } // Custom action for SELECT MMX and expand action for SELECT_CC MMX setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); // Darwin ABI issue. 
for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::ConstantPool , VT, Custom); setOperationAction(ISD::JumpTable , VT, Custom); setOperationAction(ISD::GlobalAddress , VT, Custom); setOperationAction(ISD::GlobalTLSAddress, VT, Custom); setOperationAction(ISD::ExternalSymbol , VT, Custom); setOperationAction(ISD::BlockAddress , VT, Custom); } // 64-bit shl, sra, srl (iff 32-bit x86) for (auto VT : { MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::SHL_PARTS, VT, Custom); setOperationAction(ISD::SRA_PARTS, VT, Custom); setOperationAction(ISD::SRL_PARTS, VT, Custom); } if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget.hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && TM.Options.ExceptionModel != ExceptionHandling::SjLj) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); bool Is64Bit = Subtarget.is64Bit(); setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS, VT, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG, VT, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); // These might be better off as horizontal vector ops. 
setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FSUB, VT, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); if (UseX87) addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); if (UseX87) setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. if (UseX87) setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (UseX87) { // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); for (auto VT : { MVT::f32, MVT::f64 }) { setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); } } // Expand FP32 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f32)) { if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0f)); // xorps } // Expand FP64 immediates into loads from the stack, save special cases. if (isTypeLegal(MVT::f64)) { if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. if (UseX87) { if (Subtarget.is64Bit() && Subtarget.hasMMX()) { addRegisterClass(MVT::f128, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } // Always expand sin/cos functions even though x87 has an instruction. setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // Some FP actions are always expanded for vector types. for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); } // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. 
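  // The pattern the next loop follows, shown in miniature (ToyTable is purely
  // illustrative and not a real TargetLowering structure): start every
  // (operation, type) entry at the most conservative action, then upgrade only
  // the combinations the target can really codegen.
  struct ToyTable {
    enum class Act { Legal, Custom, Expand };
    Act Tbl[4][8]; // [operation][type]; sizes picked arbitrarily
    ToyTable() {
      for (auto &Row : Tbl)
        for (Act &A : Row)
          A = Act::Expand;    // pessimistic default for everything
      Tbl[0][2] = Act::Legal; // ...then turn profitable cases back on
    }
  };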
for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass); for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { setOperationAction(ISD::SDIV, VT, Custom); setOperationAction(ISD::SREM, VT, Custom); setOperationAction(ISD::UDIV, VT, Custom); setOperationAction(ISD::UREM, VT, Custom); } setOperationAction(ISD::MUL, MVT::v2i8, Custom); setOperationAction(ISD::MUL, MVT::v2i16, Custom); setOperationAction(ISD::MUL, MVT::v2i32, Custom); setOperationAction(ISD::MUL, MVT::v4i8, Custom); setOperationAction(ISD::MUL, MVT::v4i16, Custom); setOperationAction(ISD::MUL, MVT::v8i8, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::MULHU, MVT::v4i32, Custom); setOperationAction(ISD::MULHS, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i8, Custom); setOperationAction(ISD::MULHS, MVT::v16i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); } setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); if (!ExperimentalVectorWideningLegalization) { // Use widening instead of promotion. 
for (auto VT : { MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16 }) { setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); } } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // Provide custom widening for v2f32 setcc. This is really for VLX when // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to // type legalization changing the result type to v4i1 during widening. // It works fine for SSE2 and is probably faster so no need to qualify with // VLX support. setOperationAction(ISD::SETCC, MVT::v2i32, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ABS, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } for (auto VT : { MVT::v2f64, MVT::v2i64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); if (VT == MVT::v2i64 && !Subtarget.is64Bit()) continue; setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::SELECT, MVT::v4i32, Custom); setOperationAction(ISD::SELECT, MVT::v8i16, Custom); setOperationAction(ISD::SELECT, MVT::v16i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is // split again based on the input type, this will cause an AssertSExt i16 to // be emitted instead of an AssertZExt. This will allow packssdw followed by // packuswb to be used to truncate to v8i8. This is necessary since packusdw // isn't available until sse4.1. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); // We want to legalize this to an f64 load rather than an i64 load on // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for // store. setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i16, Custom); setOperationAction(ISD::LOAD, MVT::v8i8, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2i32, Custom); setOperationAction(ISD::STORE, MVT::v4i16, Custom); setOperationAction(ISD::STORE, MVT::v8i8, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); if (ExperimentalVectorWideningLegalization) { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); } else { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom); } // In the customized shift lowering, the legal v4i32/v2i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); // With AVX512, expanding (and promoting the shifts) is better. 
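  // What "expanding" a rotate into promoted shifts looks like for one lane
  // (illustrative lambda only, never called here): a rotate-left by R becomes
  // two shifts plus an OR once the byte is widened to a shift-friendly type.
  auto RotateLeft8ViaShifts = [](unsigned char X, unsigned R) -> unsigned char {
    R &= 7;            // keep both shift amounts in range
    unsigned Wide = X; // promote so the right shift is well defined
    return static_cast<unsigned char>((Wide << R) | (Wide >> ((8 - R) & 7)));
  };
  (void)RotateLeft8ViaShifts;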
if (!Subtarget.hasAVX512()) setOperationAction(ISD::ROTL, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); setOperationAction(ISD::ABS, MVT::v4i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); setOperationAction(ISD::SUB, MVT::i16, Custom); setOperationAction(ISD::SUB, MVT::i32, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } if (!ExperimentalVectorWideningLegalization) { // Avoid narrow result types when widening. The legal types are listed // in the next loop. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); if (!ExperimentalVectorWideningLegalization) setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); } // i8 vectors are custom because the source register and source // source memory operand types are not the same width. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::ROTL, VT, Custom); // XOP can efficiently perform BITREVERSE with VPPERM. 
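  // What ISD::BITREVERSE computes, as a plain scalar reference (illustrative
  // lambda only; the XOP lowering gets the same per-byte effect with VPPERM):
  auto ReverseBits8 = [](unsigned char B) -> unsigned char {
    unsigned char R = 0;
    for (int I = 0; I != 8; ++I)
      R = static_cast<unsigned char>((R << 1) | ((B >> I) & 1)); // bit I -> bit 7-I
    return R;
  };
  (void)ReverseBits8;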
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) setOperationAction(ISD::BITREVERSE, VT, Custom); } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { bool HasInt256 = Subtarget.hasInt256(); addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass); for (auto VT : { MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); if (!Subtarget.hasAVX512()) setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); // In the customized shift lowering, the legal v8i32/v4i64 cases // in AVX2 will be recognized. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); } if (ExperimentalVectorWideningLegalization) { // These types need custom splitting if their input is a 128-bit vector. setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); } setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); // With BWI, expanding (and promoting the shifts) is the better. 
if (!Subtarget.hasBWI()) setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); setOperationAction(ISD::SELECT, MVT::v16i16, Custom); setOperationAction(ISD::SELECT, MVT::v32i8, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); // TODO - remove this once 256-bit X86ISD::ANDNP correctly split. setOperationAction(ISD::CTTZ, VT, HasInt256 ? Expand : Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasAnyFMA()) { for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::FMA, VT, Legal); } for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); } setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::MULHU, MVT::v8i32, Custom); setOperationAction(ISD::MULHS, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::MULHU, MVT::v32i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i8, Custom); setOperationAction(ISD::ABS, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v4i64, Custom); setOperationAction(ISD::UMAX, MVT::v4i64, Custom); setOperationAction(ISD::SMIN, MVT::v4i64, Custom); setOperationAction(ISD::UMIN, MVT::v4i64, Custom); setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); setOperationAction(ISD::UMIN, VT, HasInt256 ? 
Legal : Custom); } for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } if (HasInt256) { // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); } } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } if (HasInt256) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); if (HasInt256) { // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MGATHER, VT, Custom); } } // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { addRegisterClass(MVT::v1i1, &X86::VK1RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::SELECT, MVT::v1i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { setOperationAction(ISD::LOAD, MVT::v1i1, Custom); setOperationAction(ISD::LOAD, MVT::v2i1, Custom); setOperationAction(ISD::LOAD, MVT::v4i1, Custom); setOperationAction(ISD::LOAD, MVT::v8i1, Custom); setOperationAction(ISD::STORE, MVT::v1i1, Custom); setOperationAction(ISD::STORE, MVT::v2i1, Custom); setOperationAction(ISD::STORE, MVT::v4i1, Custom); setOperationAction(ISD::STORE, MVT::v8i1, Custom); } // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { setOperationAction(ISD::SIGN_EXTEND, VT, Custom); setOperationAction(ISD::ZERO_EXTEND, VT, Custom); setOperationAction(ISD::ANY_EXTEND, VT, Custom); } for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // This block controls legalization for 512-bit operations with 32/64 bit // elements. 512-bits can be disabled based on prefer-vector-width and // required-vector-width function attributes. 
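  // How a function opts out of 512-bit codegen in practice, as a small sketch
  // (illustrative lambda only, never called here; "prefer-vector-width" is the
  // string attribute clang emits for -mprefer-vector-width=N):
  auto PreferNarrowVectors = [](Function &F) {
    F.addFnAttr("prefer-vector-width", "256"); // steer codegen away from ZMM
  };
  (void)PreferNarrowVectors;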
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); } for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FNEG, VT, Custom); setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (!Subtarget.hasVLX()) { // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE // to 512-bit rather than use the AVX2 instructions so that we can use // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); if (ExperimentalVectorWideningLegalization) { // Need to custom widen this if we don't have AVX512BW. setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom); } for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); setOperationAction(ISD::FCEIL, VT, Legal); setOperationAction(ISD::FTRUNC, VT, Legal); setOperationAction(ISD::FRINT, VT, Legal); setOperationAction(ISD::FNEARBYINT, VT, Legal); } // Without BWI we need to use custom lowering to handle MVT::v64i8 input. 
for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::MULHU, MVT::v16i32, Custom); setOperationAction(ISD::MULHS, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16i32, Custom); setOperationAction(ISD::SELECT, MVT::v32i16, Custom); setOperationAction(ISD::SELECT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } if (Subtarget.hasDQI()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. // 128-bit was made Legal under AVX1. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } // Need to custom split v32i16/v64i8 bitcasts. 
if (!Subtarget.hasBWI()) { setOperationAction(ISD::BITCAST, MVT::v32i16, Custom); setOperationAction(ISD::BITCAST, MVT::v64i8, Custom); } if (Subtarget.hasVBMI2()) { for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } }// has AVX-512 // This block controls legalization for operations that don't have // pre-AVX512 equivalents. Without VLX we use 512-bit operations for // narrower widths. if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); } for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } // Custom legalize 2x32 to get a little better code. setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) setOperationAction(ISD::MSCATTER, VT, Custom); if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::SINT_TO_FP, VT, Legal); setOperationAction(ISD::UINT_TO_FP, VT, Legal); setOperationAction(ISD::FP_TO_SINT, VT, Legal); setOperationAction(ISD::FP_TO_UINT, VT, Legal); setOperationAction(ISD::MUL, VT, Legal); } } if (Subtarget.hasCDI()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { setOperationAction(ISD::CTLZ, VT, Legal); } } // Subtarget.hasCDI() if (Subtarget.hasVPOPCNTDQ()) { for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } } // This block control legalization of v32i1/v64i1 which are available with // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with // useBWIRegs. 
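  // (v32i1 and v64i1 live in the AVX-512 mask registers: the VK32/VK64
  // register classes added immediately below. Most operations on them are
  // custom-lowered or expanded rather than left to generic legalization.)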
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); for (auto VT : { MVT::v32i1, MVT::v64i1 }) { setOperationAction(ISD::ADD, VT, Custom); setOperationAction(ISD::SUB, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::UADDSAT, VT, Custom); setOperationAction(ISD::SADDSAT, VT, Custom); setOperationAction(ISD::USUBSAT, VT, Custom); setOperationAction(ISD::SSUBSAT, VT, Custom); setOperationAction(ISD::TRUNCATE, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); for (auto VT : { MVT::v16i1, MVT::v32i1 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Extends from v32i1 masks to 256-bit vectors. setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); } // This block controls legalization for v32i16 and v64i8. 512-bits can be // disabled based on prefer-vector-width and required-vector-width function // attributes. if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); // Extends from v64i1 masks to 512-bit vectors. 
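    // (For example, roughly:
    //   t1: v64i8 = sign_extend t0: v64i1
    // turns each set mask bit into an all-ones byte lane, e.g. via a
    // VPMOVM2B-style mask-to-vector expansion.)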
setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v64i8, Custom); setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::UADDSAT, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); setOperationAction(ISD::USUBSAT, VT, Legal); setOperationAction(ISD::SSUBSAT, VT, Legal); // The condition codes aren't legal in SSE/AVX and under AVX512 we use // setcc all the way to isel and prefer SETGT in some isel patterns. setCondCodeAction(ISD::SETLT, VT, Custom); setCondCodeAction(ISD::SETLE, VT, Custom); } for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); } if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v64i8, MVT::v32i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } if (Subtarget.hasVBMI2()) { setOperationAction(ISD::FSHL, MVT::v32i16, Custom); setOperationAction(ISD::FSHR, MVT::v32i16, Custom); } } if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? 
Legal : Custom); } // These operations are handled on non-VLX by artificially widening in // isel patterns. // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? if (Subtarget.hasBITALG()) { for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) setOperationAction(ISD::CTPOP, VT, Legal); } } if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); if (Subtarget.hasDQI()) { // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. // v2f32 UINT_TO_FP is already custom under SSE2. setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } if (Subtarget.hasVBMI2()) { // TODO: Make these legal even without VLX? for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::FSHL, VT, Custom); setOperationAction(ISD::FSHR, VT, Custom); } } } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget.is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget.is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); // Support carry in as value rather than glue. setOperationAction(ISD::ADDCARRY, VT, Custom); setOperationAction(ISD::SUBCARRY, VT, Custom); setOperationAction(ISD::SETCCCARRY, VT, Custom); } if (!Subtarget.is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into _sincos_stret if it is available. 
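  // (Illustrative: when both
  //   %s = call float @llvm.sin.f32(float %x)
  //   %c = call float @llvm.cos.f32(float %x)
  // are live, they can be merged into a single ISD::FSINCOS node and lowered
  // to one runtime call returning both results, assuming the libcall names
  // checked below are available.)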
if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium())) for (ISD::NodeType Op : {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG, ISD::FLOG10, ISD::FPOW, ISD::FSIN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMINNUM); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget.getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; // TODO: These control memcmp expansion in CGP and could be raised higher, but // that needs to benchmarked and balanced with the potential use of vector // load/store types (PR33329, PR33914). MaxLoadsPerMemcmp = 2; MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); // An out-of-order CPU can speculatively execute past a predictable branch, // but a conditional move could be stalled by an expensive earlier operation. PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. 
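  // (Illustrative: the MaxStoresPer* limits above bound inline expansion of
  // the memory intrinsics. For example,
  //   call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 128, i1 false)
  // may become a short run of wide stores instead of a libcall, with the
  // store type chosen by getOptimalMemOpType() further down.)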
verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } bool X86TargetLowering::useStackGuardXorFP() const { // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. return Subtarget.getTargetTriple().isOSMSVCRT(); } SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { EVT PtrTy = getPointerTy(DAG.getDataLayout()); unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); return SDValue(Node, 0); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(MVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return TypeSplitVector; if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext& Context, EVT VT) const { if (!VT.isVector()) return MVT::i8; if (Subtarget.hasAVX512()) { const unsigned NumElts = VT.getVectorNumElements(); // Figure out what this type will be legalized to. EVT LegalVT = VT; while (getTypeAction(Context, LegalVT) != TypeLegal) LegalVT = getTypeToTransformTo(Context, LegalVT); // If we got a 512-bit vector then we'll definitely have a vXi1 compare. if (LegalVT.getSimpleVT().is512BitVector()) return EVT::getVectorVT(Context, MVT::i1, NumElts); if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { // If we legalized to less than a 512-bit vector, then we will use a vXi1 // compare for vXi32/vXi64 for sure. If we have BWI we will also support // vXi16/vXi8. MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) return EVT::getVectorVT(Context, MVT::i1, NumElts); } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. 
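/// (Roughly: on x86-64 this is max(8, ABI alignment of the type); on 32-bit
/// targets with SSE it is raised to 16 whenever the aggregate contains a
/// 128-bit vector, as computed by getMaxByValAlign() above.)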
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget.is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget.hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function &F = MF.getFunction(); if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Size >= 32 && Subtarget.hasAVX()) { // Although this isn't a well-supported type for AVX1, we'll let // legalization and shuffle lowering produce the optimal codegen. If we // choose an optimal type with a vector element larger than a byte, // getMemsetStores() may create an intermediate splat (using an integer // multiply) before we splat as a vector. return MVT::v32i8; } if (Subtarget.hasSSE2()) return MVT::v16i8; // TODO: Can SSE1 handle a byte vector? // If we have SSE1 registers we should be able to use them. if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87())) return MVT::v4f32; } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. // Also, do not use f64 to lower memset unless this is a memset of zeros. // The gymnastics of splatting a byte value into an XMM register and then // only using 8-byte stores (because this is a CPU with slow unaligned // 16-byte accesses) makes that a loser. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget.is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget.isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget.isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // Misaligned accesses of any size are always allowed. 
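  // (The return value only reports legality; the *Fast out-parameter above is
  // what steers callers away from unaligned 16/32-byte accesses on subtargets
  // where those are slow.)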
return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (isPositionIndependent() && Subtarget.isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const { // Only relabel X86-32 for C / Stdcall CCs. if (Subtarget.is64Bit()) return; if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) return; unsigned ParamRegs = 0; if (auto *M = MF->getFunction().getParent()) ParamRegs = M->getNumberRegisterParameters(); // Mark the first N int arguments as having reg for (unsigned Idx = 0; Idx < Args.size(); Idx++) { Type *T = Args[Idx].Ty; if (T->isIntOrPtrTy()) if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { unsigned numRegs = 1; if (MF->getDataLayout().getTypeAllocSize(T) > 4) numRegs = 2; if (ParamRegs < numRegs) return; ParamRegs -= numRegs; Args[Idx].IsInReg = true; } } } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget.is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget.isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget.is64Bit() ? 
&X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: case MVT::v16f32: case MVT::v8f64: RRC = &X86::VR128XRegClass; break; } return std::make_pair(RRC, Cost); } unsigned X86TargetLowering::getAddressSpace() const { if (Subtarget.is64Bit()) return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; return 256; } static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); } static Constant* SegmentOffset(IRBuilder<> &IRB, unsigned Offset, unsigned AddressSpace) { return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // glibc, bionic, and Fuchsia have a special slot for the stack guard in // tcbhead_t; use it instead of the usual global variable (see // sysdeps/{i386,x86_64}/nptl/tls.h) if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_STACK_GUARD_OFFSET with this value. return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { // %fs:0x28, unless we're using a Kernel code model, in which case // it's %gs:0x28. gs:0x14 on i386. unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; return SegmentOffset(IRB, Offset, getAddressSpace()); } } return TargetLowering::getIRStackGuard(IRB); } void X86TargetLowering::insertSSPDeclarations(Module &M) const { // MSVC CRT provides functionalities for stack protection. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { // MSVC CRT has a global variable holding security cookie. M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(M.getContext())); // MSVC CRT has a function to validate security cookie. auto *SecurityCheckCookie = cast( M.getOrInsertFunction("__security_check_cookie", Type::getVoidTy(M.getContext()), Type::getInt8PtrTy(M.getContext()))); SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall); SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg); return; } // glibc, bionic, and Fuchsia have a special slot for the stack guard. if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) return; TargetLowering::insertSSPDeclarations(M); } Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { // MSVC CRT has a global variable holding security cookie. if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getGlobalVariable("__security_cookie"); } return TargetLowering::getSDagStackGuard(M); } Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { // MSVC CRT has a function to validate security cookie. 
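  // (Roughly: when this returns a non-null function, the stack-protector pass
  // emits a call to __security_check_cookie with the reloaded guard value
  // instead of an inline compare-and-branch against the guard.)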
if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { return M.getFunction("__security_check_cookie"); } return TargetLowering::getSSPStackGuardCheck(M); } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (Subtarget.getTargetTriple().isOSContiki()) return getDefaultSafeStackPointerLocation(IRB, false); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget.isTargetAndroid()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: // %gs:0x24 on i386 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; return SegmentOffset(IRB, Offset, getAddressSpace()); } // Fuchsia is similar. if (Subtarget.isTargetFuchsia()) { // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. return SegmentOffset(IRB, 0x18, getAddressSpace()); } return TargetLowering::getSafeStackPointerLocation(IRB); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } /// Lowers masks values (v*i1) to the local register values /// \returns DAG node after lowering to register type static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { EVT ValVT = ValArg.getValueType(); if (ValVT == MVT::v1i1) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, DAG.getIntPtrConstant(0, Dl)); if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { // Two stage lowering might be required // bitcast: v8i1 -> i8 / v16i1 -> i16 // anyextend: i8 -> i32 / i16 -> i32 EVT TempValLoc = ValVT == MVT::v8i1 ? 
MVT::i8 : MVT::i16; SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); if (ValLoc == MVT::i32) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); return ValToCopy; } if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { // One stage lowering is required // bitcast: v32i1 -> i32 / v64i1 -> i64 return DAG.getBitcast(ValLoc, ValArg); } return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); } /// Breaks v64i1 value into two registers and adds the new node to the DAG static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, SmallVector, 8> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The value should reside in two registers"); // Before splitting the value we cast it to i64 Arg = DAG.getBitcast(MVT::i64, Arg); // Splitting the value into two i32 types SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(0, Dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, DAG.getConstant(1, Dl, MVT::i32)); // Attach the two i32 types into corresponding registers RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); // In some cases we need to disable registers from the default CSR list. // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i32)); // Copy the result values into the output registers. for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = RVLocs[I]; assert(VA.isRegLoc() && "Can only return in registers!"); // Add the register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); SDValue ValToCopy = OutVals[OutsIndex]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. 
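    // (For example, an i1 return value assigned to a wider location register
    // arrives here as CCValAssign::SExt/ZExt/AExt and is widened with the
    // matching extend node; vXi1 mask vectors are instead routed through
    // lowerMasksToReg() above.)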
if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } else if (ValVT == MVT::f64 && (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget.is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget.hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } SmallVector, 8> RegsToPass; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I], Subtarget); assert(2 == RegsToPass.size() && "Expecting two registers after Pass64BitArgInRegs"); // Add the second register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); } else { RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); } // Add nodes to the DAG and add the values into the RetOps list for (auto &Reg : RegsToPass) { Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType())); } } // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return, and SRetReturnReg is not set for Swift. 
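    // (Illustrative, with a hypothetical %struct.S: for
    //   define void @f(%struct.S* sret %out)
    // the incoming sret pointer is copied back into %rax (or %eax on 32-bit)
    // before the RET, as arranged below.)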
// All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { // When we have both sret and another return value, we should use the // original Chain stored in RetOps[0], instead of the current Chain updated // in the above loop. If we only have sret, RetOps[0] equals to Chain. // For the case of sret and another return value, we have // Chain_0 at the function entry // Chain_1 = getCopyToReg(Chain_0) in the above loop // If we use Chain_1 in getCopyFromReg, we will have // Val = getCopyFromReg(Chain_1) // Chain_2 = getCopyToReg(Chain_1, Val) from below // getCopyToReg(Chain_0) will be glued together with // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be // in Unit B, and we will have cyclic dependency between Unit A and Unit B: // Data dependency from Unit B to Unit A due to usage of Val in // getCopyToReg(Chain_1, Val) // Chain dependency from Unit A to Unit B // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); // Add the returned register to the CalleeSaveDisableRegs list. if (ShouldDisableCalleeSavedRegister) MF.getRegInfo().disableCalleeSavedRegister(RetValReg); } const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (X86::GR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. 
if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT = MVT::i32; bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { // The ABI does not require i1, i8 or i16 to be extended. // // On Darwin, there is code in the wild relying on Clang's old behaviour of // always extending i8/i16 return values, so keep doing that for now. // (PR26665). ReturnMVT = MVT::i8; } EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Reads two 32 bit registers and creates a 64 bit mask value. /// \param VA The current 32 bit value that need to be assigned. /// \param NextVA The next 32 bit value that need to be assigned. /// \param Root The parent DAG node. /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for /// glue purposes. In the case the DAG is already using /// physical register instead of virtual, we should glue /// our new SDValue to InFlag SDvalue. /// \return a new SDvalue of size 64bit. static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &Dl, const X86Subtarget &Subtarget, SDValue *InFlag = nullptr) { assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(VA.getValVT() == MVT::v64i1 && "Expecting first location of 64 bit width type"); assert(NextVA.getValVT() == VA.getValVT() && "The locations should have the same type"); assert(VA.isRegLoc() && NextVA.isRegLoc() && "The values should reside in two registers"); SDValue Lo, Hi; unsigned Reg; SDValue ArgValueLo, ArgValueHi; MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = &X86::GR32RegClass; // Read a 32 bit value from the registers. if (nullptr == InFlag) { // When no physical register is present, // create an intermediate virtual register. Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); } else { // When a physical register is available read the value from it and glue // the reads together. ArgValueLo = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueLo.getValue(2); ArgValueHi = DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag); *InFlag = ArgValueHi.getValue(2); } // Convert the i32 type into v32i1 type. Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); // Convert the i32 type into v32i1 type. Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); // Concatenate the two values together. 
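  // (Roughly: v64i1 = concat_vectors(Lo: v32i1, Hi: v32i1), reassembling the
  // two 32-bit GPR halves read above into one 64-bit mask value.)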
return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); } /// The function will lower a register of various sizes (8/16/32/64) /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1) /// \returns a DAG node contains the operand after lowering to mask type. static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, const EVT &ValLoc, const SDLoc &Dl, SelectionDAG &DAG) { SDValue ValReturned = ValArg; if (ValVT == MVT::v1i1) return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); if (ValVT == MVT::v64i1) { // In 32 bit machine, this case is handled by getv64i1Argument assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); // In 64 bit machine, There is no need to truncate the value only bitcast } else { MVT maskLen; switch (ValVT.getSimpleVT().SimpleTy) { case MVT::v8i1: maskLen = MVT::i8; break; case MVT::v16i1: maskLen = MVT::i16; break; case MVT::v32i1: maskLen = MVT::i32; break; default: llvm_unreachable("Expecting a vector of i1 types"); } ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); } return DAG.getBitcast(ValVT, ValReturned); } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, uint32_t *RegMask) const { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Assign locations to each value returned by this call. SmallVector RVLocs; bool Is64Bit = Subtarget.is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; ++I, ++InsIndex) { CCValAssign &VA = RVLocs[I]; EVT CopyVT = VA.getLocVT(); // In some calling conventions we need to remove the used registers // from the register mask. if (RegMask) { for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); } // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { if (!Subtarget.hasX87()) report_fatal_error("X87 register return with X87 disabled"); CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } SDValue Val; if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); Val = getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag) .getValue(1); Val = Chain.getValue(0); InFlag = Chain.getValue(2); } if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. 
DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) { if (VA.getValVT().isVector() && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); } else Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); } InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(ArrayRef Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(ArrayRef Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, const SDLoc &dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. 
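/// (This is the GuaranteedTailCallOpt (-tailcallopt) path: only the calling
/// conventions listed in canGuaranteeTCO() above, namely fastcc, GHC, HiPE,
/// HHVM and X86_RegCall, may have their ABI adjusted to guarantee the call.)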
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; ImmutableCallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo &MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; MVT PtrVT = getPointerTy(DAG.getDataLayout()); // If value is passed by pointer we have address passed instead of the value // itself. No need to extend if the mask value and location share the same // absolute size. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // Calculate SP offset of interrupt parameter, re-arrange the slot normally // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); if (Subtarget.is64Bit() && Ins.size() == 2) { // The stack pointer needs to be realigned for 64 bit handlers with error // code, so the argument offset changes by 8 bytes. Offset += 8; } } // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. // FIXME: For now, all byval parameter objects are marked as aliasing. This // can be improved with deeper analysis. int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, /*isAliased=*/true); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, PtrVT); } // This is an argument in memory. We might be able to perform copy elision. if (Flags.isCopyElisionCandidate()) { EVT ArgVT = Ins[i].ArgVT; SDValue PartAddr; if (Ins[i].PartOffset == 0) { // If this is a one-part value or the first part of a multi-part value, // create a stack object for the entire argument value type and return a // load from our portion of it. This assumes that if the first part of an // argument is in memory, the rest will also be in memory. 
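      // (Copy elision, roughly: rather than copying the caller's in-memory
      // argument into a separate local slot, a fixed stack object covering
      // the caller's argument area is created and the value is loaded from
      // it in place.)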
int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), /*Immutable=*/false); PartAddr = DAG.getFrameIndex(FI, PtrVT); return DAG.getLoad( ValVT, dl, Chain, PartAddr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { // This is not the first piece of an argument in memory. See if there is // already a fixed stack object including this offset. If so, assume it // was created by the PartOffset == 0 branch above and create a load from // the appropriate offset into it. int64_t PartBegin = VA.getLocMemOffset(); int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; int FI = MFI.getObjectIndexBegin(); for (; MFI.isFixedObjectIndex(FI); ++FI) { int64_t ObjBegin = MFI.getObjectOffset(FI); int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) break; } if (MFI.isFixedObjectIndex(FI)) { SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); return DAG.getLoad( ValVT, dl, Chain, Addr, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, Ins[i].PartOffset)); } } } int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, VA.getLocMemOffset(), isImmutable); // Set SExt or ZExt flag. if (VA.getLocInfo() == CCValAssign::ZExt) { MFI.setObjectZExt(FI, true); } else if (VA.getLocInfo() == CCValAssign::SExt) { MFI.setObjectSExt(FI, true); } // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI.setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); return ExtendedInMem ? (VA.getValVT().isVector() ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) : Val; } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget &Subtarget) { assert(Subtarget.is64Bit()); if (Subtarget.isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function &F = MF.getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. 
return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } #ifndef NDEBUG static bool isSortedByValueNo(ArrayRef ArgLocs) { return std::is_sorted(ArgLocs.begin(), ArgLocs.end(), [](const CCValAssign &A, const CCValAssign &B) -> bool { return A.getValNo() < B.getValNo(); }); } #endif SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const Function &F = MF.getFunction(); if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); assert( !(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || (!Is64Bit && Ins[1].VT == MVT::i32))); if (!isLegal) report_fatal_error("X86 interrupts may take one or two arguments"); } // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Ins, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); } // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); SDValue ArgValue; for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++InsIndex) { assert(InsIndex < Ins.size() && "Invalid Ins index"); CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { assert( VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // v64i1 values, in regcall calling convention, that are // compiled to 32 bit arch, are split up into two registers. ArgValue = getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); } else { const TargetRegisterClass *RC; if (RegVT == MVT::i8) RC = &X86::GR8RegClass; else if (RegVT == MVT::i16) RC = &X86::GR16RegClass; else if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; else if (RegVT == MVT::f80) RC = &X86::RFP80RegClass; else if (RegVT == MVT::f128) RC = &X86::VR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = Subtarget.hasVLX() ? 
&X86::VR128XRegClass : &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::v1i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else if (VA.getValVT().isVector() && VA.getValVT().getScalarType() == MVT::i1 && ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); } else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal()) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); InVals.push_back(ArgValue); } for (unsigned I = 0, E = Ins.size(); I != E; ++I) { // Swift calling convention does not require we copy the sret argument // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. if (CallConv == CallingConv::Swift) continue; // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[I].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. 
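  // A rough picture of the x86-64 SysV layout built below (sizes read off the
  // code: 6 GPRs * 8 bytes + 8 XMM registers * 16 bytes = a 176-byte register
  // save area): for a callee such as
  //   int f(const char *fmt, ...)
  // the one fixed GPR argument (fmt in RDI) gives NumIntRegs == 1, so the
  // va_list's gp_offset starts at 8; with no fixed FP arguments NumXMMRegs
  // is 0, so fp_offset starts at ArgGPRs.size() * 8 == 48.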
if (MFI.hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget.useSoftFloat() && F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI.hasVAStart()) { // Find the first unallocated argument registers. ArrayRef ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector LiveGPRs; SmallVector LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by dereferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset)); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. 
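      // A sketch of what happens to these operands later: the
      // VASTART_SAVE_XMM_REGS pseudo built below is expanded by a custom
      // inserter into a test of %al plus a conditional branch that skips the
      // XMM stores entirely when the caller passed no vector arguments
      // (AL == 0), so a varargs prologue does not touch SSE state needlessly.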
SmallVector SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI.hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget.hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget.hasAVX()) VecVT = MVT::v8f32; else if (Subtarget.hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl &Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code (and the alignment padding) if // present. FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). 
Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accommodate holding this slot at the correct offset). int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } if (CallConv == CallingConv::X86_RegCall || F.hasFnAttribute("no_caller_saved_registers")) { MachineRegisterInfo &MRI = MF.getRegInfo(); for (std::pair Pair : MRI.liveins()) MRI.disableCalleeSavedRegister(Pair.first); } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr( SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, const SDLoc &dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, const SDLoc &dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI)); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. 
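/// For example, for v4f32 the mask built below is <4, 1, 2, 3>: lane 0 is
/// taken from V2 and lanes 1-3 from V1, matching MOVSS/MOVSD semantics.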
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget.is64Bit(); bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo(); auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); const auto *CI = dyn_cast_or_null(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); const auto *II = dyn_cast_or_null(CLI.CS.getInstruction()); bool HasNoCfCheck = (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()); const Module *M = MF.getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget.isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64. if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeArguments(Outs, CC_X86); // In vectorcall calling convention a second pass is required for the HVA // types. 
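  // (Under the Microsoft vectorcall convention, a homogeneous vector
  // aggregate, i.e. a struct of up to four members of one vector type, is
  // assigned registers only after all other arguments have been allocated,
  // which is why a second analysis pass is needed here.)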
if (CallingConv::X86_VectorCall == CallConv) { CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; SDValue StackPtr; // The next loop assumes that the locations are in the same order of the // input arguments. assert(isSortedByValueNo(ArgLocs) && "Argument Location list must be sorted before lowering"); // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[I]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[OutIndex]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. 
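        // (The sequence below: bitcast the x86mmx value to i64, insert it
        // into lane 0 of a v2i64 via SCALAR_TO_VECTOR, then use getMOVL so
        // the value lands in the low quadword of the XMM register with the
        // upper lane left undef.)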
Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { if (isByVal) { // Memcpy the argument to a temporary stack slot to prevent // the caller from seeing any modifications the callee may make // as guaranteed by the `byval` attribute. int FrameIdx = MF.getFrameInfo().CreateStackObject( Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()), false); SDValue StackSlot = DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); Chain = CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); // From now on treat this as a regular pointer Arg = StackSlot; isByVal = false; } else { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); Arg = SpillSlot; } break; } } if (VA.needsCustom()) { assert(VA.getValVT() == MVT::v64i1 && "Currently the only custom case is when we split v64i1 to 2 regs"); // Split v64i1 value into two registers Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); } else if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget.isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. 
GlobalAddressSDNode *G = dyn_cast(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget.hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector MemOpChains2; SDValue FIN; int FI = 0; for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutsIndex) { CCValAssign &VA = ArgLocs[I]; if (VA.isRegLoc()) { if (VA.needsCustom()) { assert((CallConv == CallingConv::X86_RegCall) && "Expecting custom case only in regcall calling convention"); // This means that we are in special case where one argument was // passed through two register locations - Skip the next location ++I; } continue; } assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. 
MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV); Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); if (OpFlags == X86II::MO_GOTPCREL) { // Add a wrapper. Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); if (OpFlags == X86II::MO_GOTPCREL) { Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } } else if (Subtarget.isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we // set X86_INTR calling convention because it has the same CSR mask // (same preserved registers). const uint32_t *Mask = RegInfo->getCallPreservedMask( MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { const Function &CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn.hasPersonalityFn() ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } // Define a new register mask from the existing mask. uint32_t *RegMask = nullptr; // In some calling conventions we need to remove the used physical registers // from the reg mask. if (CallConv == CallingConv::X86_RegCall || HasNCSR) { const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); // Allocate a new Reg Mask and copy Mask. RegMask = MF.allocateRegMask(); unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); // Make sure all sub registers of the argument registers are reset // in the RegMask. for (auto const &RegPair : RegsToPass) for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true); SubRegs.isValid(); ++SubRegs) RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32)); // Create the RegMask Operand according to our updated mask. Ops.push_back(DAG.getRegisterMask(RegMask)); } else { // Create the RegMask Operand according to the static mask. Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo().setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } if (HasNoCfCheck && IsCFProtectionSupported) { Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); } else { Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); } InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget.getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { // No need to reset the stack after the call if the call doesn't return. 
To // make the MI verify, we'll pretend the callee does it for us. NumBytesForCalleeToPop = NumBytes; } // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, RegMask); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII, const CCValAssign &VA) { unsigned Bytes = Arg.getValueSizeInBits() / 8; for (;;) { // Look through nodes that don't alter the bits of the incoming value. 
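    // (For instance, an i8 argument that reaches the call wrapped as
    // trunc(AssertZext(...)) or as a plain zero/any-extend of its incoming
    // value still compares equal to the caller's original stack slot once
    // these wrappers are peeled off.)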
unsigned Op = Arg.getOpcode(); if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { Arg = Arg.getOperand(0); continue; } if (Op == ISD::TRUNCATE) { const SDValue &TruncInput = Arg.getOperand(0); if (TruncInput.getOpcode() == ISD::AssertZext && cast(TruncInput.getOperand(1))->getVT() == Arg.getValueType()) { Arg = TruncInput.getOperand(0); continue; } } break; } int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI.isFixedObjectIndex(FI)) return false; if (Offset != MFI.getObjectOffset(FI)) return false; // If this is not byval, check that the argument stack object is immutable. // inalloca and argument copy elision can create mutable argument stack // objects. Byval objects can be mutated, but a byval call intends to pass the // mutated memory. if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) return false; if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) { // If the argument location is wider than the argument type, check that any // extension flags match. if (Flags.isZExt() != MFI.isObjectZExt(FI) || Flags.isSExt() != MFI.isObjectSExt(FI)) { return false; } } return Bytes == MFI.getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. 
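  // (Win64 always reserves a 32-byte home area, four 8-byte slots for
  // RCX/RDX/R8/R9, immediately above the return address; SysV has no such
  // area, so mixing the two conventions in a sibcall would shift every
  // stack argument's offset.)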
if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector RVLocs; CCState CCInfo(CalleeCC, false, MF, RVLocs, C); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, RetCC_X86, RetCC_X86)) return false; // The callee has to preserve all registers the caller needs to preserve. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. 
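      // For illustration, a hypothetical 32-bit caller like
      //   int f(int a, int b) { return g(a, b); }
      // passes the check below because both outgoing arguments are loads
      // from the caller's own incoming argument slots at the same offsets,
      // so no stores are needed and the call can become a plain jump.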
MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII, VA)) return false; } } } bool PositionIndependent = isPositionIndependent(); // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget.is64Bit() && ((!isa(Callee) && !isa(Callee)) || PositionIndependent)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = PositionIndependent ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. 
return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool MayFoldIntoZeroExtend(SDValue Op) { if (Op.hasOneUse()) { unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); return (ISD::ZERO_EXTEND == Opcode); } return false; } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: case X86ISD::MOVHLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VPERMIL2: case X86ISD::VPERMI: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VZEXT_MOVL: return true; } } static bool isTargetShuffleVariableMask(unsigned Opcode) { switch (Opcode) { default: return false; // Target Shuffles. case X86ISD::PSHUFB: case X86ISD::VPERMILPV: case X86ISD::VPERMIL2: case X86ISD::VPPERM: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; // 'Faux' Target Shuffles. case ISD::OR: case ISD::AND: case X86ISD::ANDNP: return true; } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. 
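  // (With the kernel code model everything is linked into the top 2 GiB of
  // the address space, i.e. at sign-extended 32-bit addresses in [-2^31, 0);
  // adding a non-negative 32-bit offset therefore cannot leave the signed
  // 32-bit range, while a negative one might.)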
if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: case X86::COND_NE: case X86::COND_B: case X86::COND_A: case X86::COND_BE: case X86::COND_AE: return true; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return false; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. 
if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) return false; Info.opc = ISD::INTRINSIC_W_CHAIN; Info.flags = MachineMemOperand::MONone; Info.offset = 0; switch (IntrData->Type) { case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { Info.ptrVal = I.getArgOperand(0); MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; if (IntrData->Type == TRUNCATE_TO_MEM_VI8) ScalarVT = MVT::i8; else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) ScalarVT = MVT::i16; else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) ScalarVT = MVT::i32; Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = 1; Info.flags |= MachineMemOperand::MOStore; break; } default: return false; } return true; } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; return true; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool X86TargetLowering::reduceSelectOfFPConstantLoads(bool IsFPSetCC) const {
  // If we are using XMM registers in the ABI and the condition of the select is
  // a floating-point compare and we have blendv or conditional move, then it is
  // cheaper to select instead of doing a cross-register move and creating a
  // load that depends on the compare result.
  return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
}

bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
  // TODO: It might be a win to ease or lift this restriction, but the generic
  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
  if (VT.isVector() && Subtarget.hasAVX512())
    return false;

  return true;
}

bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
  // TODO: We handle scalars using custom code, but generic combining could make
  // that unnecessary.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // If vector multiply is legal, assume that's faster than shl + add/sub.
  // TODO: Multiply is a complex op with higher latency and lower throughput in
  // most implementations, so this check could be loosened based on type
  // and/or a CPU attribute.
  if (isOperationLegal(ISD::MUL, VT))
    return false;

  // shl+add, shl+sub, shl+add+neg
  return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
         (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}

bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                                 bool IsSigned) const {
  // f80 FP_TO_UINT is more efficient using Strict code if FCMOV is available.
  return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Mask vectors support all subregister combinations and operations that
  // extract half of vector.
  if (ResVT.getVectorElementType() == MVT::i1)
    return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
                          (Index == ResVT.getVectorNumElements()));

  return (Index % ResVT.getVectorNumElements()) == 0;
}

bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
  // If the vector op is not supported, try to convert to scalar.
  EVT VecVT = VecOp.getValueType();
  if (!isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), VecVT))
    return true;

  // If the vector op is supported, but the scalar op is not, the transform may
  // not be worthwhile.
  EVT ScalarVT = VecVT.getScalarType();
  return isOperationLegalOrCustomOrPromote(VecOp.getOpcode(), ScalarVT);
}

bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
                                                EVT BitcastVT) const {
  if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
      BitcastVT.getVectorElementType() == MVT::i1)
    return false;

  if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
    return false;

  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
}

bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                                         const SelectionDAG &DAG) const {
  // Do not merge to float value size (128 bits) if no implicit
  // float attribute is set.
  bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
      Attribute::NoImplicitFloat);

  if (NoFloat) {
    unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
    return (MemVT.getSizeInBits() <= MaxIntSize);
  }
  return true;
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  EVT VT = Y.getValueType();

  if (VT.isVector())
    return false;

  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return !isa<ConstantSDNode>(Y);
}

bool X86TargetLowering::hasAndNot(SDValue Y) const {
  EVT VT = Y.getValueType();

  if (!VT.isVector())
    return hasAndNotCompare(Y);

  // Vector.

  if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
    return false;

  if (VT == MVT::v4i32)
    return true;

  return Subtarget.hasSSE2();
}

bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
  EVT VT = Y.getValueType();

  // For vectors, we don't have a preference, but we probably want a mask.
  if (VT.isVector())
    return false;

  // 64-bit shifts on 32-bit targets produce really bad bloated code.
  if (VT == MVT::i64 && !Subtarget.is64Bit())
    return false;

  return true;
}

bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
  // Any legal vector type can be splatted more efficiently than
  // loading/spilling from memory.
  return isTypeLegal(VT);
}

MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}

/// Return true if Val falls within the specified half-open range [Low, Hi).
static bool isInRange(int Val, int Low, int Hi) {
  return (Val >= Low && Val < Hi);
}

/// Return true if the value of any element in Mask falls within the specified
/// half-open range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (isInRange(M, Low, Hi))
      return true;
  return false;
}

/// Return true if Val is undef or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified half-open range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos + Size, falls within the specified
/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
                                       unsigned Size, int Low, int Step = 1) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low + Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}

/// Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zeroed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
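    // For example (illustrative), the pair (M0 = SM_SentinelZero,
    // M1 = SM_SentinelUndef) widens to a single SM_SentinelZero element.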
if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { WidenedMask[i / 2] = SM_SentinelZero; continue; } return false; } // Finally check if the two mask values are adjacent and aligned with // a pair. if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { WidenedMask[i / 2] = M0 / 2; continue; } // Otherwise we can't safely widen the elements used in this shuffle. return false; } assert(WidenedMask.size() == Mask.size() / 2 && "Incorrect size of mask after widening the elements!"); return true; } static bool canWidenShuffleElements(ArrayRef Mask, const APInt &Zeroable, SmallVectorImpl &WidenedMask) { SmallVector TargetMask(Mask.begin(), Mask.end()); for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { if (TargetMask[i] == SM_SentinelUndef) continue; if (Zeroable[i]) TargetMask[i] = SM_SentinelZero; } return canWidenShuffleElements(TargetMask, WidenedMask); } static bool canWidenShuffleElements(ArrayRef Mask) { SmallVector WidenedMask; return canWidenShuffleElements(Mask, WidenedMask); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants. // Use an UNDEF node if MaskElt == -1. // Split 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask = false) { SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } static SDValue getConstVector(ArrayRef Bits, APInt &Undefs, MVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(Bits.size() == Undefs.getBitWidth() && "Unequal constant and undef arrays"); SmallVector Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0, e = Bits.size(); i != e; ++i) { if (Undefs[i]) { Ops.append(Split ? 
2 : 1, DAG.getUNDEF(EltVT)); continue; } const APInt &V = Bits[i]; assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); if (Split) { Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); } else if (EltVT == MVT::f32) { APFloat FV(APFloat::IEEEsingle(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else if (EltVT == MVT::f64) { APFloat FV(APFloat::IEEEdouble(), V); Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); } else { Ops.push_back(DAG.getConstant(V, dl, EltVT)); } } SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); return DAG.getBitcast(VT, ConstsNode); } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || VT.getVectorElementType() == MVT::i1) && "Unexpected vector type"); // Try to build SSE/AVX zero vectors as bitcasted to their dest // type. This ensures they get CSE'd. But if the integer type is not // available, use a floating-point +0.0 instead. SDValue Vec; if (!Subtarget.hasSSE2() && VT.is128BitVector()) { Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); Vec = DAG.getConstant(0, dl, VT); } else { unsigned Num32BitElts = VT.getSizeInBits() / 32; Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); } return DAG.getBitcast(VT, Vec); } static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector(ResultVT, dl, Vec->ops().slice(IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. 
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return extractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.isUndef()) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } /// Widen a vector to a larger size with the same scalar type, with the new /// elements either zero or undef. static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { assert(Vec.getValueSizeInBits() < VT.getSizeInBits() && Vec.getValueType().getScalarType() == VT.getScalarType() && "Unsupported vector widening type"); SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) : DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, DAG.getIntPtrConstant(0, dl)); } // Helper for splitting operands of an operation to legal target size and // apply a function on each part. // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. 
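// For example (illustrative), on a target with AVX2 but not BWI, a v64i8
// operation is split into two v32i8 invocations of the Builder described
// below, e.g.:
//   SplitOpsAndApply(DAG, Subtarget, DL, MVT::v64i8, {X, Y},
//                    [](SelectionDAG &DAG, const SDLoc &DL,
//                       ArrayRef<SDValue> Ops) {
//                      return DAG.getNode(X86ISD::AVG, DL,
//                                         Ops[0].getValueType(), Ops);
//                    });
// where X and Y stand for v64i8 operands.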
// The argument Builder is a function that will be applied on each split part: // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef) template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, F Builder, bool CheckBWI = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; if ((CheckBWI && Subtarget.useBWIRegs()) || (!CheckBWI && Subtarget.useAVX512Regs())) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); } } else if (Subtarget.hasAVX2()) { if (VT.getSizeInBits() > 256) { NumSubs = VT.getSizeInBits() / 256; assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); } } else { if (VT.getSizeInBits() > 128) { NumSubs = VT.getSizeInBits() / 128; assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); } } if (NumSubs == 1) return Builder(DAG, DL, Ops); SmallVector Subs; for (unsigned i = 0; i != NumSubs; ++i) { SmallVector SubOps; for (SDValue Op : Ops) { EVT OpVT = Op.getValueType(); unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); } Subs.push_back(Builder(DAG, DL, SubOps)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); } // Return true if the instruction zeroes the unused upper part of the // destination and accepts mask. static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { switch (Opcode) { default: return false; case X86ISD::CMPM: case X86ISD::CMPM_RND: case ISD::SETCC: return true; } } /// Insert i1-subvector to i1-vector. static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa(Idx)) return SDValue(); // Inserting undef is a nop. We can just return the original vector. if (SubVec.isUndef()) return Vec; unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); // Extend to natively supported kshift. MVT WideOpVT = OpVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts // if necessary. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { // May need to promote to a legal type. Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, Idx); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); SDValue Undef = DAG.getUNDEF(WideOpVT); if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. 
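    // For example (illustrative), when inserting a v2i1 SubVec at index 0 of a
    // v8i1 Vec, the two shifts above cleared the low two bits of Vec, which
    // are then OR'd with the zero-extended SubVec.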
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), SubVec, ZeroIdx); Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { assert(IdxVal != 0 && "Unexpected index"); SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { assert(IdxVal != 0 && "Unexpected index"); NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(ShiftLeft, dl, MVT::i8)); if (ShiftRight != 0) SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, DAG.getConstant(ShiftRight, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (SubVecNumElems * 2 == NumElems) { // Special case, use legal zero extending insert_subvector. This allows // isel to opimitize when bits are known zero. Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), Vec, ZeroIdx); } else { // Otherwise use explicit shifts to zero the bits. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); NumElems = WideOpVT.getVectorNumElements(); SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); } Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } // Inserting into the middle is more complicated. NumElems = WideOpVT.getVectorNumElements(); // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); // Move the current value of the bit to be replace to the lsbs. Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Xor with the new bit. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); // Shift to MSB, filling bottom bits with 0. unsigned ShiftLeft = NumElems - SubVecNumElems; Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, DAG.getConstant(ShiftLeft, dl, MVT::i8)); // Shift to the final position, filling upper bits with 0. unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, DAG.getConstant(ShiftRight, dl, MVT::i8)); // Xor with original vector leaving the new value. Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); // Reduce to original width if needed. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, const SDLoc &dl, unsigned VectorWidth) { SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth); return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth); } /// Returns a vector of specified type with all bits set. 
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected a 128/256/512-bit vector type"); APInt Ones = APInt::getAllOnesValue(32); unsigned NumElts = VT.getSizeInBits() / 32; SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); return DAG.getBitcast(VT, Vec); } static SDValue getExtendInVec(bool Signed, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG) { EVT InVT = In.getValueType(); assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); // For 256-bit vectors, we only need the lower (128-bit) input half. // For 512-bit vectors, we only need the lower input half or quarter. if (InVT.getSizeInBits() > 128) { assert(VT.getSizeInBits() == InVT.getSizeInBits() && "Expected VTs to be the same size!"); unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); In = extractSubVector(In, 0, DAG, DL, std::max(128U, VT.getSizeInBits() / Scale)); InVT = In.getValueType(); } if (VT.getVectorNumElements() == InVT.getVectorNumElements()) return DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, In); return DAG.getNode(Signed ? ISD::SIGN_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, In); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { SmallVector Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); int NumElems = VT.getVectorNumElements(); SmallVector MaskVec(NumElems); for (int i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec[i] = (i == Idx) ? NumElems : i; return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); } // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops. static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) { while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) V = V.getOperand(0); return V; } static const Constant *getTargetConstantFromNode(SDValue Op) { Op = peekThroughBitcasts(Op); auto *Load = dyn_cast(Op); if (!Load) return nullptr; SDValue Ptr = Load->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *CNode = dyn_cast(Ptr); if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) return nullptr; return CNode->getConstVal(); } // Extract raw constant bits from constant pools. 
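// The decoded bits are returned in EltBits, re-split into EltSizeInBits-wide
// elements, with UndefElts recording which of those elements are entirely
// undef. AllowWholeUndefs / AllowPartialUndefs control whether undef source
// bits cause the decode to fail.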
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl &EltBits, bool AllowWholeUndefs = true, bool AllowPartialUndefs = true) { assert(EltBits.empty() && "Expected an empty EltBits vector"); Op = peekThroughBitcasts(Op); EVT VT = Op.getValueType(); unsigned SizeInBits = VT.getSizeInBits(); assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); unsigned NumElts = SizeInBits / EltSizeInBits; // Bitcast a source array of element bits to the target size. auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef SrcEltBits) { unsigned NumSrcElts = UndefSrcElts.getBitWidth(); unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && "Constant bit sizes don't match"); // Don't split if we don't allow undef bits. bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; if (UndefSrcElts.getBoolValue() && !AllowUndefs) return false; // If we're already the right size, don't bother bitcasting. if (NumSrcElts == NumElts) { UndefElts = UndefSrcElts; EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); return true; } // Extract all the undef/constant element data and pack into single bitsets. APInt UndefBits(SizeInBits, 0); APInt MaskBits(SizeInBits, 0); for (unsigned i = 0; i != NumSrcElts; ++i) { unsigned BitOffset = i * SrcEltSizeInBits; if (UndefSrcElts[i]) UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); MaskBits.insertBits(SrcEltBits[i], BitOffset); } // Split the undef/constant single bitset data into the target elements. UndefElts = APInt(NumElts, 0); EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); for (unsigned i = 0; i != NumElts; ++i) { unsigned BitOffset = i * EltSizeInBits; APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); // Only treat an element as UNDEF if all bits are UNDEF. if (UndefEltBits.isAllOnesValue()) { if (!AllowWholeUndefs) return false; UndefElts.setBit(i); continue; } // If only some bits are UNDEF then treat them as zero (or bail if not // supported). if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) return false; APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset); EltBits[i] = Bits.getZExtValue(); } return true; }; // Collect constant bits and insert into mask/undef bit masks. auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, unsigned UndefBitIndex) { if (!Cst) return false; if (isa(Cst)) { Undefs.setBit(UndefBitIndex); return true; } if (auto *CInt = dyn_cast(Cst)) { Mask = CInt->getValue(); return true; } if (auto *CFP = dyn_cast(Cst)) { Mask = CFP->getValueAPF().bitcastToAPInt(); return true; } return false; }; // Handle UNDEFs. if (Op.isUndef()) { APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); SmallVector SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract scalar constant bits. if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); SmallVector SrcEltBits(1, Cst->getAPIntValue()); return CastBitData(UndefSrcElts, SrcEltBits); } if (auto *Cst = dyn_cast(Op)) { APInt UndefSrcElts = APInt::getNullValue(1); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SmallVector SrcEltBits(1, RawBits); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from build vector. 
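  // For example (illustrative), (v4i32 build_vector 1, undef, 3, 4) decoded
  // with EltSizeInBits == 32 gives EltBits == {1, 0, 3, 4} and sets bit 1 of
  // UndefElts.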
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } if (ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { const SDValue &Src = Op.getOperand(i); if (Src.isUndef()) { UndefSrcElts.setBit(i); continue; } auto *Cst = cast(Src); APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); SrcEltBits[i] = RawBits.zextOrTrunc(SrcEltSizeInBits); } return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from constant pool vector. if (auto *Cst = getTargetConstantFromNode(Op)) { Type *CstTy = Cst->getType(); unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) return false; unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); for (unsigned i = 0; i != NumSrcElts; ++i) if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], UndefSrcElts, i)) return false; return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a broadcasted constant pool scalar. if (Op.getOpcode() == X86ISD::VBROADCAST && EltSizeInBits <= VT.getScalarSizeInBits()) { if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) { unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) { if (UndefSrcElts[0]) UndefSrcElts.setBits(0, NumSrcElts); SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); return CastBitData(UndefSrcElts, SrcEltBits); } } } // Extract a rematerialized scalar constant insertion. if (Op.getOpcode() == X86ISD::VZEXT_MOVL && Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && isa(Op.getOperand(0).getOperand(0))) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; APInt UndefSrcElts(NumSrcElts, 0); SmallVector SrcEltBits; auto *CN = cast(Op.getOperand(0).getOperand(0)); SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); return CastBitData(UndefSrcElts, SrcEltBits); } // Extract constant bits from a subvector's source. if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && isa(Op.getOperand(1))) { // TODO - support extract_subvector through bitcasts. 
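    // For example (illustrative), extracting the upper v2i64 half (constant
    // index 2) of a v4i64 constant keeps only EltBits[2..3] and the matching
    // undef bits.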
if (EltSizeInBits != VT.getScalarSizeInBits()) return false; if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts, EltBits, AllowWholeUndefs, AllowPartialUndefs)) { EVT SrcVT = Op.getOperand(0).getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = VT.getVectorNumElements(); unsigned BaseIdx = Op.getConstantOperandVal(1); UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); if ((BaseIdx + NumSubElts) != NumSrcElts) EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); if (BaseIdx != 0) EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); return true; } } // Extract constant bits from shuffle node sources. if (auto *SVN = dyn_cast(Op)) { // TODO - support shuffle through bitcasts. if (EltSizeInBits != VT.getScalarSizeInBits()) return false; ArrayRef Mask = SVN->getMask(); if ((!AllowWholeUndefs || !AllowPartialUndefs) && llvm::any_of(Mask, [](int M) { return M < 0; })) return false; APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if (isAnyInRange(Mask, 0, NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, UndefElts0, EltBits0, AllowWholeUndefs, AllowPartialUndefs)) return false; if (isAnyInRange(Mask, NumElts, 2 * NumElts) && !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, UndefElts1, EltBits1, AllowWholeUndefs, AllowPartialUndefs)) return false; UndefElts = APInt::getNullValue(NumElts); for (int i = 0; i != (int)NumElts; ++i) { int M = Mask[i]; if (M < 0) { UndefElts.setBit(i); EltBits.push_back(APInt::getNullValue(EltSizeInBits)); } else if (M < (int)NumElts) { if (UndefElts0[M]) UndefElts.setBit(i); EltBits.push_back(EltBits0[M]); } else { if (UndefElts1[M - NumElts]) UndefElts.setBit(i); EltBits.push_back(EltBits1[M - NumElts]); } } return true; } return false; } static bool isConstantSplat(SDValue Op, APInt &SplatVal) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits, true, false)) { int SplatIndex = -1; for (int i = 0, e = EltBits.size(); i != e; ++i) { if (UndefElts[i]) continue; if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { SplatIndex = -1; break; } SplatIndex = i; } if (0 <= SplatIndex) { SplatVal = EltBits[SplatIndex]; return true; } } return false; } static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl &RawMask, APInt &UndefElts) { // Extract the raw target constant bits. SmallVector EltBits; if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) return false; // Insert the extracted elements into the mask. for (APInt Elt : EltBits) RawMask.push_back(Elt.getZExtValue()); return true; } /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. /// Note: This ignores saturation, so inputs must be checked first. static void createPackShuffleMask(MVT VT, SmallVectorImpl &Mask, bool Unary) { assert(Mask.empty() && "Expected an empty shuffle mask vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); unsigned Offset = Unary ? 
0 : NumElts; for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane)); for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); } } // Split the demanded elts of a PACKSS/PACKUS node between its operands. static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS) { int NumLanes = VT.getSizeInBits() / 128; int NumElts = DemandedElts.getBitWidth(); int NumInnerElts = NumElts / 2; int NumEltsPerLane = NumElts / NumLanes; int NumInnerEltsPerLane = NumInnerElts / NumLanes; DemandedLHS = APInt::getNullValue(NumInnerElts); DemandedRHS = APInt::getNullValue(NumInnerElts); // Map DemandedElts to the packed operands. for (int Lane = 0; Lane != NumLanes; ++Lane) { for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { int OuterIdx = (Lane * NumEltsPerLane) + Elt; int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; if (DemandedElts[OuterIdx]) DemandedLHS.setBit(InnerIdx); if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) DemandedRHS.setBit(InnerIdx); } } } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. /// Sets \p IsUnary to true if only one source is used. Note that this will set /// IsUnary for shuffles which use a single input multiple times, and in those /// cases it will adjust the mask to only have indices within that single input. /// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); unsigned MaskEltSize = VT.getScalarSizeInBits(); SmallVector RawMask; APInt RawUndefs; SDValue ImmN; assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); IsUnary = false; bool IsFakeUnary = false; switch (N->getOpcode()) { case X86ISD::BLENDI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeBLENDMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeSHUFPMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(1)) && isa(N->getOperand(2))) { int BitLen = N->getConstantOperandVal(1); int BitIdx = N->getConstantOperandVal(2); DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = true; } break; case 
X86ISD::INSERTQI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa(N->getOperand(2)) && isa(N->getOperand(3))) { int BitLen = N->getConstantOperandVal(2); int BitIdx = N->getConstantOperandVal(3); DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); } break; case X86ISD::UNPCKH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePALIGNRMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); Ops.push_back(N->getOperand(1)); Ops.push_back(N->getOperand(0)); break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSLLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSRLDQMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFHWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSHUFLWMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VBROADCAST: { SDValue N0 = 
N->getOperand(0); // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so, // add the pre-extracted value to the Ops vector. if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.getOperand(0).getValueType() == VT && N0.getConstantOperandVal(1) == 0) Ops.push_back(N0.getOperand(0)); // We only decode broadcasts of same-sized vectors, unless the broadcast // came from an extract from the original width. If we found one, we // pushed it the Ops vector above. if (N0.getValueType() == VT || !Ops.empty()) { DecodeVectorBroadcast(NumElems, Mask); IsUnary = true; break; } return false; } case X86ISD::VPERMILPV: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::PSHUFB: { assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodePSHUFBMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMI: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERMMask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodeVPERM2X128Mask(NumElems, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUF128: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(NumElems, Mask); IsUnary = true; break; case X86ISD::VPERMIL2: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); SDValue CtrlNode = N->getOperand(3); if (ConstantSDNode *CtrlOp = dyn_cast(CtrlNode)) { unsigned CtrlImm = CtrlOp->getZExtValue(); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMIL2PMask(NumElems, MaskEltSize, 
CtrlImm, RawMask, RawUndefs, Mask); break; } } return false; } case X86ISD::VPPERM: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { DecodeVPPERMMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV: { assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMVMask(RawMask, RawUndefs, Mask); break; } return false; } case X86ISD::VPERMV3: { assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N->getOperand(0)); Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, RawUndefs)) { DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // Empty mask indicates the decode failed. if (Mask.empty()) return false; // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return false; // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); // If we didn't already add operands in the opcode-specific code, default to // adding 1 or 2 operands starting at 0. if (Ops.empty()) { Ops.push_back(N->getOperand(0)); if (!IsUnary || IsFakeUnary) Ops.push_back(N->getOperand(1)); } return true; } /// Check a target shuffle mask's inputs to see if we can set any values to /// SM_SentinelZero - this is for elements that are known to be zero /// (not just zeroable) from their inputs. /// Returns true if the target shuffle mask was decoded. static bool setTargetShuffleZeroElements(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; MVT VT = N.getSimpleValueType(); if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary)) return false; SDValue V1 = Ops[0]; SDValue V2 = IsUnary ? V1 : Ops[1]; V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); assert((VT.getSizeInBits() % Mask.size()) == 0 && "Illegal split of shuffle value type"); unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size(); // Extract known constant input data. APInt UndefSrcElts[2]; SmallVector SrcEltBits[2]; bool IsSrcConstant[2] = { getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false), getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false)}; for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Already decoded as SM_SentinelZero / SM_SentinelUndef. if (M < 0) continue; // Determine shuffle input and normalize the mask. 
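    // e.g. (illustrative) with a 4-element mask, M == 6 selects element 2 of
    // the second input V2.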
unsigned SrcIdx = M / Size; SDValue V = M < Size ? V1 : V2; M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { Mask[i] = SM_SentinelUndef; continue; } // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF. // TODO: We currently only set UNDEF for integer types - floats use the same // registers as vectors and many of the scalar folded loads rely on the // SCALAR_TO_VECTOR pattern. if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && (Size % V.getValueType().getVectorNumElements()) == 0) { int Scale = Size / V.getValueType().getVectorNumElements(); int Idx = M / Scale; if (Idx != 0 && !VT.isFloatingPoint()) Mask[i] = SM_SentinelUndef; else if (Idx == 0 && X86::isZeroNode(V.getOperand(0))) Mask[i] = SM_SentinelZero; continue; } // Attempt to extract from the source's constant bits. if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) Mask[i] = SM_SentinelUndef; else if (SrcEltBits[SrcIdx][M] == 0) Mask[i] = SM_SentinelZero; } } assert(VT.getVectorNumElements() == Mask.size() && "Different mask size from vector size!"); return true; } // Forward declaration (for getFauxShuffleMask recursive check). static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG); // Attempt to decode ops that could be represented as a shuffle mask. // The decoded shuffle mask may contain a different number of elements to the // destination value type. static bool getFauxShuffleMask(SDValue N, SmallVectorImpl &Mask, SmallVectorImpl &Ops, const SelectionDAG &DAG) { Mask.clear(); Ops.clear(); MVT VT = N.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); unsigned NumSizeInBits = VT.getSizeInBits(); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 && "Expected byte aligned value types"); unsigned Opcode = N.getOpcode(); switch (Opcode) { case ISD::VECTOR_SHUFFLE: { // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here. ArrayRef ShuffleMask = cast(N)->getMask(); if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) { Mask.append(ShuffleMask.begin(), ShuffleMask.end()); Ops.push_back(N.getOperand(0)); Ops.push_back(N.getOperand(1)); return true; } return false; } case ISD::AND: case X86ISD::ANDNP: { // Attempt to decode as a per-byte mask. APInt UndefElts; SmallVector EltBits; SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); bool IsAndN = (X86ISD::ANDNP == Opcode); uint64_t ZeroMask = IsAndN ? 255 : 0; if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits)) return false; for (int i = 0, e = (int)EltBits.size(); i != e; ++i) { if (UndefElts[i]) { Mask.push_back(SM_SentinelUndef); continue; } uint64_t ByteBits = EltBits[i].getZExtValue(); if (ByteBits != 0 && ByteBits != 255) return false; Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i); } Ops.push_back(IsAndN ? N1 : N0); return true; } case ISD::OR: { // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other // is a valid shuffle index. 
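    // For example (illustrative), OR of a shuffle producing <x0, 0, x2, 0>
    // with one producing <0, y1, 0, y3> is treated as the two-input shuffle
    // mask <0, 5, 2, 7> over inputs X and Y.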
SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1)); if (!N0.getValueType().isVector() || !N1.getValueType().isVector()) return false; SmallVector SrcMask0, SrcMask1; SmallVector SrcInputs0, SrcInputs1; if (!resolveTargetShuffleInputs(N0, SrcInputs0, SrcMask0, DAG) || !resolveTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG)) return false; int MaskSize = std::max(SrcMask0.size(), SrcMask1.size()); SmallVector Mask0, Mask1; scaleShuffleMask(MaskSize / SrcMask0.size(), SrcMask0, Mask0); scaleShuffleMask(MaskSize / SrcMask1.size(), SrcMask1, Mask1); for (int i = 0; i != MaskSize; ++i) { if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef) Mask.push_back(SM_SentinelUndef); else if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero) Mask.push_back(SM_SentinelZero); else if (Mask1[i] == SM_SentinelZero) Mask.push_back(Mask0[i]); else if (Mask0[i] == SM_SentinelZero) Mask.push_back(Mask1[i] + (MaskSize * SrcInputs0.size())); else return false; } for (SDValue &Op : SrcInputs0) Ops.push_back(Op); for (SDValue &Op : SrcInputs1) Ops.push_back(Op); return true; } case ISD::INSERT_SUBVECTOR: { // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(EXTRACT_SUBVECTOR(SRC1)) where // SRC0/SRC1 are both of the same valuetype VT. // TODO - add peekThroughOneUseBitcasts support. SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); EVT SubVT = Sub.getValueType(); unsigned NumSubElts = SubVT.getVectorNumElements(); if (!isa(N.getOperand(2)) || !N->isOnlyUserOf(Sub.getNode())) return false; SmallVector SubMask; SmallVector SubInputs; if (!resolveTargetShuffleInputs(Sub, SubInputs, SubMask, DAG) || SubMask.size() != NumSubElts) return false; Ops.push_back(Src); for (SDValue &SubInput : SubInputs) { if (SubInput.getOpcode() != ISD::EXTRACT_SUBVECTOR || SubInput.getOperand(0).getValueType() != VT || !isa(SubInput.getOperand(1))) return false; Ops.push_back(SubInput.getOperand(0)); } int InsertIdx = N.getConstantOperandVal(2); for (int i = 0; i != (int)NumElts; ++i) Mask.push_back(i); for (int i = 0; i != (int)NumSubElts; ++i) { int M = SubMask[i]; if (0 <= M) { int InputIdx = M / NumSubElts; int ExtractIdx = SubInputs[InputIdx].getConstantOperandVal(1); M = (NumElts * (1 + InputIdx)) + ExtractIdx + (M % NumSubElts); } Mask[i + InsertIdx] = M; } return true; } case ISD::SCALAR_TO_VECTOR: { // Match against a scalar_to_vector of an extract from a vector, // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar. 
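    // For example (illustrative), (v4i32 scalar_to_vector (X86ISD::PEXTRW V, 3))
    // decodes to the v8i16 mask <3, Z, U, U, U, U, U, U> over V: the extracted
    // i16 lands in element 0, the implicitly zero-extended high half is zero
    // and the remaining elements are undef.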
SDValue N0 = N.getOperand(0); SDValue SrcExtract; if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getOperand(0).getValueType() == VT) || (N0.getOpcode() == X86ISD::PEXTRW && N0.getOperand(0).getValueType() == MVT::v8i16) || (N0.getOpcode() == X86ISD::PEXTRB && N0.getOperand(0).getValueType() == MVT::v16i8)) { SrcExtract = N0; } if (!SrcExtract || !isa(SrcExtract.getOperand(1))) return false; SDValue SrcVec = SrcExtract.getOperand(0); EVT SrcVT = SrcVec.getValueType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1; unsigned SrcIdx = SrcExtract.getConstantOperandVal(1); if (NumSrcElts <= SrcIdx) return false; Ops.push_back(SrcVec); Mask.push_back(SrcIdx); Mask.append(NumZeros, SM_SentinelZero); Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef); return true; } case X86ISD::PINSRB: case X86ISD::PINSRW: { SDValue InVec = N.getOperand(0); SDValue InScl = N.getOperand(1); SDValue InIndex = N.getOperand(2); if (!isa(InIndex) || cast(InIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t InIdx = N.getConstantOperandVal(2); // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern. if (X86::isZeroNode(InScl)) { Ops.push_back(InVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i); return true; } // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. // TODO: Expand this to support INSERT_VECTOR_ELT/etc. unsigned ExOp = (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); if (InScl.getOpcode() != ExOp) return false; SDValue ExVec = InScl.getOperand(0); SDValue ExIndex = InScl.getOperand(1); if (!isa(ExIndex) || cast(ExIndex)->getAPIntValue().uge(NumElts)) return false; uint64_t ExIdx = InScl.getConstantOperandVal(1); Ops.push_back(InVec); Ops.push_back(ExVec); for (unsigned i = 0; i != NumElts; ++i) Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } case X86ISD::PACKSS: case X86ISD::PACKUS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && N1.getValueType().getVectorNumElements() == (NumElts / 2) && "Unexpected input value type"); // If we know input saturation won't happen we can treat this // as a truncation shuffle. if (Opcode == X86ISD::PACKSS) { if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) return false; } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) return false; } bool IsUnary = (N0 == N1); Ops.push_back(N0); if (!IsUnary) Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); return true; } case X86ISD::VSHLI: case X86ISD::VSRLI: { uint64_t ShiftVal = N.getConstantOperandVal(1); // Out of range bit shifts are guaranteed to be zero. if (NumBitsPerElt <= ShiftVal) { Mask.append(NumElts, SM_SentinelZero); return true; } // We can only decode 'whole byte' bit shifts as shuffles. if ((ShiftVal % 8) != 0) break; uint64_t ByteShift = ShiftVal / 8; unsigned NumBytes = NumSizeInBits / 8; unsigned NumBytesPerElt = NumBitsPerElt / 8; Ops.push_back(N.getOperand(0)); // Clear mask to all zeros and insert the shifted byte indices. 
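// Illustrative sketch (not part of this file): the byte-level mask built by
// the VSHLI/VSRLI handling that follows. A whole-byte element shift is a byte
// shuffle in which the vacated bytes read zero. Standalone C++ mirroring the
// two loops below; the helper name and parameter names are assumptions for
// exposition only.
#include <vector>

namespace faux_shuffle_example {
inline std::vector<int> byteShiftMask(unsigned NumBytes,
                                      unsigned NumBytesPerElt,
                                      unsigned ByteShift, bool IsLeftShift) {
  const int Zero = -2;
  std::vector<int> Mask(NumBytes, Zero);
  for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) {
      if (IsLeftShift)
        Mask[i + j] = int(i + j - ByteShift); // bytes move up, low bytes zero
      else
        Mask[i + j - ByteShift] = int(i + j); // bytes move down, high bytes zero
    }
  return Mask;
}
} // namespace faux_shuffle_example
// E.g. v2i64 VSHLI by 16 bits: NumBytes = 16, NumBytesPerElt = 8,
// ByteShift = 2, giving {Z, Z, 0, 1, 2, 3, 4, 5, Z, Z, 8, 9, 10, 11, 12, 13}.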
Mask.append(NumBytes, SM_SentinelZero); if (X86ISD::VSHLI == Opcode) { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j] = i + j - ByteShift; } else { for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt) for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) Mask[i + j - ByteShift] = i + j; } return true; } case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::ZERO_EXTEND: { // TODO - add support for VPMOVZX with smaller input vector types. SDValue Src = N.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (NumSizeInBits != SrcVT.getSizeInBits()) break; DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, Mask); Ops.push_back(Src); return true; } } return false; } /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly. static void resolveTargetShuffleInputsAndMask(SmallVectorImpl &Inputs, SmallVectorImpl &Mask) { int MaskWidth = Mask.size(); SmallVector UsedInputs; for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; // Strip UNDEF input usage. if (Inputs[i].isUndef()) for (int &M : Mask) if ((lo <= M) && (M < hi)) M = SM_SentinelUndef; // Check for unused inputs. if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { UsedInputs.push_back(Inputs[i]); continue; } for (int &M : Mask) if (lo <= M) M -= MaskWidth; } Inputs = UsedInputs; } /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the /// remaining input indices in case we now have a unary shuffle and adjust the /// inputs accordingly. /// Returns true if the target shuffle mask was decoded. static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl &Inputs, SmallVectorImpl &Mask, const SelectionDAG &DAG) { if (!setTargetShuffleZeroElements(Op, Mask, Inputs)) if (!getFauxShuffleMask(Op, Mask, Inputs, DAG)) return false; resolveTargetShuffleInputsAndMask(Inputs, Mask); return true; } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt == SM_SentinelZero) return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT) : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT); if (Elt == SM_SentinelUndef) return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); SDValue NewV = (Elt < NumElems) ? 
ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } // Use PINSRB/PINSRW/PINSRD to create a build vector. static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); unsigned NumElts = VT.getVectorNumElements(); assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && "Illegal vector insertion"); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < NumElts; ++i) { bool IsNonZero = (NonZeros & (1 << i)) != 0; if (!IsNonZero) continue; // If the build vector contains zeros or our first insertion is not the // first index then insert into zero vector to break any register // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. if (First) { First = false; if (NumZero || 0 != i) V = getZeroVector(VT, Subtarget, DAG, dl); else { assert(0 == i && "Expected insertion into zero-index"); V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(VT, V); continue; } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return V; } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget.hasSSE41()) return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); SDLoc dl(Op); SDValue V; bool First = true; // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { // FIXME: Investigate extending to i32 instead of just i16. // FIXME: Investigate combining the first 4 bytes as a i32 instead. SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt) { if (1 == i) { V = NumZero ? 
DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); V = DAG.getBitcast(MVT::v8i16, V); } else { V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i / 2, dl)); } } } } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); // Use PINSRW to insert each byte directly. return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If this is a splat of a pair of elements, use MOVDDUP (unless the target // has XOP; in that case defer lowering to potentially use VPERMIL2PS). // Because we're creating a less complicated build vector here, we may enable // further folding of the MOVDDUP via shuffle transforms. if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && Op.getOperand(0) == Op.getOperand(2) && Op.getOperand(1) == Op.getOperand(3) && Op.getOperand(0) != Op.getOperand(1)) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); // Create a new build vector with the first 2 elements followed by undef // padding, bitcast to v2f64, duplicate, and bitcast back. SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); return DAG.getBitcast(VT, Dup); } // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = Elt.getConstantOperandVal(1); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. 
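// Illustrative sketch (not part of this file): the blend-with-zero mask built
// above before handing the node to the shuffle legalizer. When every non-zero
// element is extracted from lane i of the same source vector, lane i keeps
// that source lane and every zeroable lane reads lane i of an all-zero second
// operand, which appears as index i + 4. Standalone C++; names are assumptions
// for exposition only.
#include <array>
#include <bitset>

namespace build_vector_example {
inline std::array<int, 4> blendWithZeroMask(const std::bitset<4> &Zeroable) {
  std::array<int, 4> Mask{};
  for (int i = 0; i != 4; ++i)
    Mask[i] = Zeroable[i] ? i + 4 : i; // second shuffle operand is the zero vector
  return Mask;
}
} // namespace build_vector_example
// E.g. build_vector(x[0], x[1], 0, 0): lanes 2 and 3 are zeroable, so the
// mask is {0, 1, 6, 7}.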
SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getBitcast(VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget.hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i); } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getBitcast(MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getBitcast(MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v16i8; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa(Ptr.getOperand(0))) { FI = cast(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI.isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI.setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). 
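// Illustrative sketch (not part of this file): the offset-absorption
// arithmetic performed below. The scalar load at base + Offset is widened to
// an aligned vector load at base + StartOffset, and the residual byte offset
// becomes the lane to splat (4-byte elements). Standalone C++; the struct and
// helper names are assumptions for exposition only.
#include <cstdint>
#include <optional>

namespace splat_load_example {
struct WidenedLoad {
  std::int64_t StartOffset; // aligned base offset of the new vector load
  int EltNo;                // lane of the widened load to splat
};

inline std::optional<WidenedLoad> absorbOffset(std::int64_t Offset,
                                               unsigned RequiredAlign) {
  if (Offset < 0 || ((Offset % RequiredAlign) & 3))
    return std::nullopt;                          // must stay 4-byte aligned
  WidenedLoad W;
  W.StartOffset = Offset & ~std::int64_t(RequiredAlign - 1);
  W.EltNo = int((Offset - W.StartOffset) >> 2);   // residual offset in dwords
  return W;
}
} // namespace splat_load_example
// E.g. a 16-byte vector (RequiredAlign = 16) and Offset = 40 give
// StartOffset = 32 and EltNo = 2: load [base+32, base+48) and splat lane 2.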
if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset)); SmallVector Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); } return SDValue(); } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example: -> zextload a static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool isAfterLegalize) { unsigned NumElems = Elts.size(); int LastLoadedElt = -1; SmallBitVector LoadMask(NumElems, false); SmallBitVector ZeroMask(NumElems, false); SmallBitVector UndefMask(NumElems, false); // For each element in the initializer, see if we've found a load, zero or an // undef. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = peekThroughBitcasts(Elts[i]); if (!Elt.getNode()) return SDValue(); if (Elt.isUndef()) UndefMask[i] = true; else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) ZeroMask[i] = true; else if (ISD::isNON_EXTLoad(Elt.getNode())) { LoadMask[i] = true; LastLoadedElt = i; // Each loaded element must be the correct fractional portion of the // requested vector load. if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) return SDValue(); } else return SDValue(); } assert((ZeroMask | UndefMask | LoadMask).count() == NumElems && "Incomplete element masks"); // Handle Special Cases - all undef or undef/zero. if (UndefMask.count() == NumElems) return DAG.getUNDEF(VT); // FIXME: Should we return this as a BUILD_VECTOR instead? if ((ZeroMask | UndefMask).count() == NumElems) return VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); int FirstLoadedElt = LoadMask.find_first(); SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); LoadSDNode *LDBase = cast(EltBase); EVT LDBaseVT = EltBase.getValueType(); // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. 
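// Illustrative sketch (not part of this file): the clearing shuffle used below
// when the consecutive-load pattern also contains ZERO elements. The wide load
// is still emitted and is then shuffled against an all-zero vector so that the
// zero lanes are re-zeroed. Standalone C++; names are assumptions for
// exposition only.
#include <vector>

namespace consecutive_loads_example {
// LoadMask[i] / ZeroMask[i] record whether element i was a load or a zero;
// all other lanes stay undef (-1).
inline std::vector<int> clearMask(const std::vector<bool> &LoadMask,
                                  const std::vector<bool> &ZeroMask) {
  const unsigned NumElems = unsigned(LoadMask.size());
  std::vector<int> Mask(NumElems, -1);
  for (unsigned i = 0; i != NumElems; ++i) {
    if (ZeroMask[i])
      Mask[i] = int(i + NumElems); // take lane i of the zero vector
    else if (LoadMask[i])
      Mask[i] = int(i);            // keep lane i of the wide load
  }
  return Mask;
}
} // namespace consecutive_loads_example
// E.g. elements {load, zero, load, load} give mask {0, 5, 2, 3}, i.e.
// shuffle(WideLoad, ZeroVector, {0, 5, 2, 3}).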
bool IsConsecutiveLoad = true; bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { SDValue Elt = peekThroughBitcasts(Elts[i]); LoadSDNode *LD = cast(Elt); if (!DAG.areNonVolatileConsecutiveLoads( LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8, i - FirstLoadedElt)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; } } else if (ZeroMask[i]) { IsConsecutiveLoad = false; } } SmallVector Loads; for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) if (LoadMask[i]) Loads.push_back(cast(peekThroughBitcasts(Elts[i]))); auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && "Cannot merge volatile loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); for (auto *LD : Loads) DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; // LOAD - all consecutive load/undefs (must start/end with a load). // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. // If the vector contains zeros, then attempt to shuffle those elements. if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) && (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); // Ensure that the input vector size for the merged loads matches the // cumulative size of the input elements. if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) return SDValue(); if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); // Don't create 256-bit non-temporal aligned loads without AVX2 as these // will lower to regular temporal loads and use the cache. if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 && VT.is256BitVector() && !Subtarget.hasInt256()) return SDValue(); if (IsConsecutiveLoad) return CreateLoad(VT, LDBase); // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded // vector and a zero vector to clear out the zero elements. if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) { SmallVector ClearMask(NumElems, -1); for (unsigned i = 0; i < NumElems; ++i) { if (ZeroMask[i]) ClearMask[i] = i + NumElems; else if (LoadMask[i]) ClearMask[i] = i; } SDValue V = CreateLoad(VT, LDBase); SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT); return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); } } int LoadSize = (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits(); // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && (LoadSize == 32 || LoadSize == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? 
MVT::getFloatingPointVT(LoadSize) : MVT::getIntegerVT(LoadSize); MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize); if (TLI.isTypeLegal(VecVT)) { SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getAlignment(), MachineMemOperand::MOLoad); for (auto *LD : Loads) DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } return SDValue(); } static Constant *getConstantVector(MVT VT, const APInt &SplatValue, unsigned SplatBitSize, LLVMContext &C) { unsigned ScalarSize = VT.getScalarSizeInBits(); unsigned NumElm = SplatBitSize / ScalarSize; SmallVector ConstantVec; for (unsigned i = 0; i < NumElm; i++) { APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i); Constant *Const; if (VT.isFloatingPoint()) { if (ScalarSize == 32) { Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val)); } else { assert(ScalarSize == 64 && "Unsupported floating point scalar size"); Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val)); } } else Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val); ConstantVec.push_back(Const); } return ConstantVector::get(ArrayRef(ConstantVec)); } static bool isUseOfShuffle(SDNode *N) { for (auto *U : N->uses()) { if (isTargetShuffle(U->getOpcode())) return true; if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts return isUseOfShuffle(U); } return false; } // Check if the current node of build vector is a zero extended vector. // // If so, return the value extended. // // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a. // // NumElt - return the number of zero extended identical values. // // EltType - return the type of the value include the zero extend. static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, unsigned &NumElt, MVT &EltType) { SDValue ExtValue = Op->getOperand(0); unsigned NumElts = Op->getNumOperands(); unsigned Delta = NumElts; for (unsigned i = 1; i < NumElts; i++) { if (Op->getOperand(i) == ExtValue) { Delta = i; break; } if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) return SDValue(); } if (!isPowerOf2_32(Delta) || Delta == 1) return SDValue(); for (unsigned i = Delta; i < NumElts; i++) { if (i % Delta == 0) { if (Op->getOperand(i) != ExtValue) return SDValue(); } else if (!(isNullConstant(Op->getOperand(i)) || Op->getOperand(i).isUndef())) return SDValue(); } unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); unsigned ExtVTSize = EltSize * Delta; EltType = MVT::getIntegerVT(ExtVTSize); NumElt = NumElts / Delta; return ExtValue; } /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>). /// /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. 
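// Illustrative sketch (not part of this file): case (b) of the comment above.
// A constant build_vector such as <0,1,0,1,0,1,0,1> is really a splat of a
// wider value (here the 2-element block <0,1>), so the block can be loaded
// once and broadcast. Standalone C++ that finds the smallest power-of-two
// repeating block; it ignores undef elements and the helper name is an
// assumption for exposition only.
#include <cstdint>
#include <vector>

namespace broadcast_example {
inline unsigned repeatedBlockElts(const std::vector<std::uint64_t> &Elts) {
  const unsigned N = unsigned(Elts.size());
  for (unsigned Block = 1; Block < N; Block *= 2) {
    bool Repeats = true;
    for (unsigned i = Block; i != N && Repeats; ++i)
      Repeats = (Elts[i] == Elts[i % Block]);
    if (Repeats)
      return Block; // broadcast a Block-element constant
  }
  return N;         // no smaller repeat: fall back to a full constant load
}
} // namespace broadcast_example
// E.g. {0,1,0,1,0,1,0,1} -> 2, {7,7,7,7} -> 1, {0,1,2,3} -> 4.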
if (!Subtarget.hasAVX()) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); SDLoc dl(BVOp); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); BitVector UndefElements; SDValue Ld = BVOp->getSplatValue(&UndefElements); // Attempt to use VBROADCASTM // From this paterrn: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) // b. t1 = (build_vector t0 t0) // // Create (VBROADCASTM v2i1 X) if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { MVT EltType = VT.getScalarType(); unsigned NumElts = VT.getVectorNumElements(); SDValue BOperand; SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { if (ZeroExtended) BOperand = ZeroExtended.getOperand(0); else BOperand = Ld.getOperand(0).getOperand(0); MVT MaskVT = BOperand.getSimpleValueType(); if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d SDValue Brdcst = DAG.getNode(X86ISD::VBROADCASTM, dl, MVT::getVectorVT(EltType, NumElts), BOperand); return DAG.getBitcast(VT, Brdcst); } } } unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefElts = UndefElements.count(); if (!Ld || (NumElts - NumUndefElts) <= 1) { APInt SplatValue, Undef; unsigned SplatBitSize; bool HasUndef; // Check if this is a repeated constant pattern suitable for broadcasting. if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) && SplatBitSize > VT.getScalarSizeInBits() && SplatBitSize < VT.getSizeInBits()) { // Avoid replacing with broadcast when it's a use of a shuffle // instruction to preserve the present custom lowering of shuffles. if (isUseOfShuffle(BVOp) || BVOp->hasOneUse()) return SDValue(); // replace BUILD_VECTOR with broadcast of the repeated constants. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); LLVMContext *Ctx = DAG.getContext(); MVT PVT = TLI.getPointerTy(DAG.getDataLayout()); if (Subtarget.hasAVX()) { if (SplatBitSize <= 64 && Subtarget.hasAVX2() && !(SplatBitSize == 64 && Subtarget.is32Bit())) { // Splatted value can fit in one INTEGER constant in constant pool. // Load the constant and broadcast it. MVT CVT = MVT::getIntegerVT(SplatBitSize); Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize); Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize == 32 || SplatBitSize == 64) { // Splatted value can fit in one FLOAT constant in constant pool. // Load the constant and broadcast it. // AVX have support for 32 and 64 bit broadcast for floats only. // No 64bit integer in 32bit subtarget. MVT CVT = MVT::getFloatingPointVT(SplatBitSize); // Lower the splat via APFloat directly, to avoid any conversion. Constant *C = SplatBitSize == 32 ? 
ConstantFP::get(*Ctx, APFloat(APFloat::IEEEsingle(), SplatValue)) : ConstantFP::get(*Ctx, APFloat(APFloat::IEEEdouble(), SplatValue)); SDValue CP = DAG.getConstantPool(C, PVT); unsigned Repeat = VT.getSizeInBits() / SplatBitSize; unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl, MVT::getVectorVT(CVT, Repeat), Ld); return DAG.getBitcast(VT, Brdcst); } else if (SplatBitSize > 64) { // Load the vector of constants and broadcast it. MVT CVT = VT.getScalarType(); Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx); SDValue VCP = DAG.getConstantPool(VecC, PVT); unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits(); unsigned Alignment = cast(VCP)->getAlignment(); Ld = DAG.getLoad( MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld); return DAG.getBitcast(VT, Brdcst); } } } // If we are moving a scalar into a vector (Ld must be set and all elements // but 1 are undef) and that operation is not obviously supported by // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast. // That's better than general shuffling and may eliminate a load to GPR and // move from scalar to vector register. if (!Ld || NumElts - NumUndefElts != 1) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64))) return SDValue(); } bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); unsigned ScalarSize = Ld.getValueSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. 
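// Illustrative sketch (not part of this file): the condition of the `if` that
// follows, restated as a standalone predicate so the comment above is easier
// to map onto the code. It adds nothing beyond the check below; parameter
// names are assumptions for exposition only.
namespace broadcast_example {
inline bool shouldBroadcastConstScalar(unsigned ScalarSize, bool IsGE256,
                                       bool OptForSize, bool HasAVX2) {
  return ScalarSize == 32 ||                  // f32/i32 splats, always
         (IsGE256 && ScalarSize == 64) ||     // v4f64/v4i64 splats
         (OptForSize && (ScalarSize == 64 ||  // v2f64/v2i64 when optimizing for size
                         HasAVX2));           // i8/i16 for size with AVX2
}
} // namespace broadcast_example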
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget.hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget.hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast(ExtIdx)->getZExtValue(); if (!isa(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector InsertIndices; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. 
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}

static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
  }
  SDLoc dl(Op);
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return Op;

  if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      // Split the pieces.
      SDValue Lower =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
      SDValue Upper =
          DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
      // We have to manually lower both halves so getNode doesn't try to
      // reassemble the build_vector.
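// Illustrative sketch (not part of this file): what ConvertI1VectorToInteger
// above computes. A constant vXi1 build_vector becomes an integer immediate
// whose bit i is element i; undef elements contribute 0. Standalone C++; the
// helper name and the int-encoded elements are assumptions for exposition
// only.
#include <cstdint>
#include <vector>

namespace vxi1_example {
inline std::uint64_t packI1Vector(const std::vector<int> &Elts) {
  // Elts[i] is 0 or 1, or -1 for an undef element.
  std::uint64_t Imm = 0;
  for (unsigned i = 0; i != unsigned(Elts.size()); ++i)
    if (Elts[i] == 1)
      Imm |= std::uint64_t(1) << i;
  return Imm;
}
} // namespace vxi1_example
// E.g. v8i1 <1,0,1,1,0,0,0,1> packs to 0b10001101 = 0x8D, which is then
// bitcast back to the mask type.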
Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget); Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper); } SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.isUndef()) continue; if (!isa(In)) NonConstIdx.push_back(idx); else { Immediate |= (cast(In)->getZExtValue() & 0x1) << idx; HasConstElts = true; } if (SplatIdx < 0) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; SDValue Imm; if (Immediate) { MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); Imm = DAG.getConstant(Immediate, dl, ImmVT); } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) DstVec = DAG.getBitcast(VT, Imm); else { SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// 128-bit partial horizontal operation on a 256-bit vector, but that operation /// may not match the layout of an x86 256-bit horizontal instruction. /// In other words, if this returns true, then some extraction/insertion will /// be required to produce a valid horizontal instruction. /// /// Parameter \p Opcode defines the kind of horizontal operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). /// /// TODO: This function was originally used to match both real and fake partial /// horizontal operations, but the index-matching logic is incorrect for that. /// See the corrected implementation in isHopBuildVector(). Can we reduce this /// code because it is only used for partial h-op matching now? 
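// Illustrative sketch (not part of this file): the extract-index pattern that
// horizontal-op matching looks for. Result lane k of a 128-bit HADD is
// src[2k] + src[2k+1]; 256-bit ops work per 128-bit chunk and choose between
// the two sources per 64-bit half. The formula mirrors the expected index
// computed in isHopBuildVector (further down); names are assumptions for
// exposition only.
namespace hop_example {
// For build_vector lane (Chunk, LaneInChunk), with EltsPer128 elements per
// 128-bit chunk, the matched operands are
//   extract(Src, Expected) op extract(Src, Expected + 1).
inline unsigned expectedExtractIndex(unsigned Chunk, unsigned LaneInChunk,
                                     unsigned EltsPer128) {
  return Chunk * EltsPer128 + (LaneInChunk % (EltsPer128 / 2)) * 2;
}
} // namespace hop_example
// E.g. v8i32 HADD: lanes 0 and 1 use indices {0,1} and {2,3} of the first
// source, lanes 2 and 3 use {0,1} and {2,3} of the second source, and lanes
// 4..7 repeat the pattern with indices 4..7 in the upper 128-bit chunk.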
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops"); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->isUndef()) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa(Op0.getOperand(1)) && isa(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.isUndef()) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.isUndef()) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. 
If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { MVT VT = V0.getSimpleValueType(); assert(VT.is256BitVector() && VT == V1.getSimpleValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL); MVT NewVT = V0_LO.getSimpleValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && !V0->isUndef()) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && !V1->isUndef()) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef())) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef())) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Returns true iff \p BV builds a vector with the result equivalent to /// the result of ADDSUB/SUBADD operation. /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters /// \p Opnd0 and \p Opnd1. static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd) { MVT VT = BV->getSimpleValueType(0); if (!Subtarget.hasSSE3() || !VT.isFloatingPoint()) return false; unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); NumExtracts = 0; // Odd-numbered elements in the input build vector are obtained from // adding/subtracting two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting/adding two integer/float elements. unsigned Opc[2] = {0, 0}; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) continue; // Early exit if we found an unexpected opcode. if (Opcode != ISD::FADD && Opcode != ISD::FSUB) return false; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return false; unsigned I0 = cast(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return false; // We found a valid add/sub node, make sure its the same opcode as previous // elements for this parity. if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode) return false; Opc[i % 2] = Opcode; // Update InVec0 and InVec1. 
if (InVec0.isUndef()) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return false; } if (InVec1.isUndef()) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return false; } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (Opcode == ISD::FSUB) return false; // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return false; } if (InVec1 != Op1.getOperand(0)) return false; // Increment the number of extractions done. ++NumExtracts; } // Ensure we have found an opcode for both parities and that they are // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the // inputs are undef. if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] || InVec0.isUndef() || InVec1.isUndef()) return false; IsSubAdd = Opc[0] == ISD::FADD; Opnd0 = InVec0; Opnd1 = InVec1; return true; } /// Returns true if is possible to fold MUL and an idiom that has already been /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2. /// /// Prior to calling this function it should be known that there is some /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called /// before replacement of such SDNode with ADDSUB operation. Thus the number /// of \p Opnd0 uses is expected to be equal to 2. /// For example, this function may be called for the following IR: /// %AB = fmul fast <2 x double> %A, %B /// %Sub = fsub fast <2 x double> %AB, %C /// %Add = fadd fast <2 x double> %AB, %C /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, /// <2 x i32> /// There is a def for %Addsub here, which potentially can be replaced by /// X86ISD::ADDSUB operation: /// %Addsub = X86ISD::ADDSUB %AB, %C /// and such ADDSUB can further be replaced with FMADDSUB: /// %Addsub = FMADDSUB %A, %B, %C. /// /// The main reason why this method is called before the replacement of the /// recognized ADDSUB idiom with ADDSUB operation is that such replacement /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses) { if (Opnd0.getOpcode() != ISD::FMUL || !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; // FIXME: These checks must match the similar ones in // DAGCombiner::visitFADDForFMACombine. It would be good to have one // function that would answer if it is Ok to fuse MUL + ADD to FMADD // or MUL + ADDSUB to FMADDSUB. const TargetOptions &Options = DAG.getTarget().Options; bool AllowFusion = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath); if (!AllowFusion) return false; Opnd2 = Opnd1; Opnd1 = Opnd0.getOperand(1); Opnd0 = Opnd0.getOperand(0); return true; } /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or /// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or /// X86ISD::FMSUBADD node. 
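// Illustrative sketch (not part of this file): the parity rule behind the
// ADDSUB/SUBADD recognition above and the FMADDSUB fold. Even result lanes use
// one opcode and odd lanes the other, always on matching extract indices:
//   <a0-b0, a1+b1, a2-b2, a3+b3>  ->  ADDSUB(A, B)   (even FSUB, odd FADD)
//   <a0+b0, a1-b1, a2+b2, a3-b3>  ->  SUBADD(A, B)   (even FADD, odd FSUB)
// and when A is itself an FMUL(x, y) with the expected number of uses,
// ADDSUB(A, B) can become FMADDSUB(x, y, B) (SUBADD becomes FMSUBADD). A tiny
// classifier over the two per-parity opcodes; names are assumptions for
// exposition only.
namespace addsub_example {
enum class Kind { None, AddSub, SubAdd };

inline Kind classify(bool EvenIsAdd, bool OddIsAdd) {
  if (EvenIsAdd == OddIsAdd)
    return Kind::None;                 // the two parities must differ
  return EvenIsAdd ? Kind::SubAdd : Kind::AddSub;
}
} // namespace addsub_example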
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; unsigned NumExtracts; bool IsSubAdd; if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts, IsSubAdd)) return SDValue(); MVT VT = BV->getSimpleValueType(0); SDLoc DL(BV); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } // We only support ADDSUB. if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom // recognition. if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1) { // Initialize outputs to known values. MVT VT = BV->getSimpleValueType(0); HOpcode = ISD::DELETED_NODE; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit // half of the result is calculated independently from the 128-bit halves of // the inputs, so that makes the index-checking logic below more complicated. unsigned NumElts = VT.getVectorNumElements(); unsigned GenericOpcode = ISD::DELETED_NODE; unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1; unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2; for (unsigned i = 0; i != Num128BitChunks; ++i) { for (unsigned j = 0; j != NumEltsIn128Bits; ++j) { // Ignore undef elements. SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j); if (Op.isUndef()) continue; // If there's an opcode mismatch, we're done. if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode) return false; // Initialize horizontal opcode. if (HOpcode == ISD::DELETED_NODE) { GenericOpcode = Op.getOpcode(); switch (GenericOpcode) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: return false; } } SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0.getOperand(0) != Op1.getOperand(0) || !isa(Op0.getOperand(1)) || !isa(Op1.getOperand(1)) || !Op.hasOneUse()) return false; // The source vector is chosen based on which 64-bit half of the // destination vector is being calculated. if (j < NumEltsIn64Bits) { if (V0.isUndef()) V0 = Op0.getOperand(0); } else { if (V1.isUndef()) V1 = Op0.getOperand(0); } SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1; if (SourceVec != Op0.getOperand(0)) return false; // op (extract_vector_elt A, I), (extract_vector_elt A, I+1) unsigned ExtIndex0 = Op0.getConstantOperandVal(1); unsigned ExtIndex1 = Op1.getConstantOperandVal(1); unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2; if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1) continue; // If this is not a commutative op, this does not match. 
if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD) return false; // Addition is commutative, so try swapping the extract indexes. // op (extract_vector_elt A, I+1), (extract_vector_elt A, I) if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1) continue; // Extract indexes do not match horizontal requirement. return false; } } // We matched. Opcode and operands are returned by reference as arguments. return true; } static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1) { // If either input vector is not the same size as the build vector, // extract/insert the low bits to the correct size. // This is free (examples: zmm --> xmm, xmm --> ymm). MVT VT = BV->getSimpleValueType(0); unsigned Width = VT.getSizeInBits(); if (V0.getValueSizeInBits() > Width) V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width); else if (V0.getValueSizeInBits() < Width) V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width); if (V1.getValueSizeInBits() > Width) V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width); else if (V1.getValueSizeInBits() < Width) V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width); return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We need at least 2 non-undef elements to make this worthwhile by default. unsigned NumNonUndefs = 0; for (const SDValue &V : BV->op_values()) if (!V.isUndef()) ++NumNonUndefs; if (NumNonUndefs < 2) return SDValue(); // There are 4 sets of horizontal math operations distinguished by type: // int/FP at 128-bit/256-bit. Each type was introduced with a different // subtarget feature. Try to match those "native" patterns first. MVT VT = BV->getSimpleValueType(0); unsigned HOpcode; SDValue V0, V1; if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); if ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); if ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); if ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2()) if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); // Try harder to match 256-bit ops by using extract/concat. if (!Subtarget.hasAVX() || !VT.is256BitVector()) return SDValue(); // Count the number of UNDEF operands in the build_vector in input. 
unsigned NumElts = VT.getVectorNumElements(); unsigned Half = NumElts / 2; unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->isUndef()) NumUndefsHI++; SDLoc DL(BV); SDValue InVec0, InVec1; if (VT == MVT::v8i32 || VT == MVT::v16i16) { SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binops followed by // a concat vector. We must adjust the outputs from the partial horizontal // matching calls above to account for undefined vector halves. SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0; SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1; assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"); bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) { unsigned X86Opcode; if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } /// If a BUILD_VECTOR's source elements all apply the same bit operation and /// one of their operands is constant, lower to a pair of BUILD_VECTOR and /// just apply the bit to the vectors. /// NOTE: Its not in our interest to start make a general purpose vectorizer /// from this, but enough scalar bit operations are created from the later /// legalization + scalarization stages to need basic support. 
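///
/// Illustrative shape of the transform (hypothetical IR, not from a test case):
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))
/// provided the bit op is legal for the vector type and the node is not a
/// splat of a single constant.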
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Check that all elements have the same opcode. // TODO: Should we allow UNDEFS and if so how many? unsigned Opcode = Op->getOperand(0).getOpcode(); for (unsigned i = 1; i < NumElems; ++i) if (Opcode != Op->getOperand(i).getOpcode()) return SDValue(); // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). switch (Opcode) { default: return SDValue(); case ISD::AND: case ISD::XOR: case ISD::OR: // Don't do this if the buildvector is a splat - we'd replace one // constant with an entire vector. if (Op->getSplatValue()) return SDValue(); if (!TLI.isOperationLegalOrPromote(Opcode, VT)) return SDValue(); break; } SmallVector LHSElts, RHSElts; for (SDValue Elt : Op->ops()) { SDValue LHS = Elt.getOperand(0); SDValue RHS = Elt.getOperand(1); // We expect the canonicalized RHS operand to be the constant. if (!isa(RHS)) return SDValue(); LHSElts.push_back(LHS); RHSElts.push_back(RHS); } SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); return DAG.getNode(Opcode, DL, VT, LHS, RHS); } /// Create a vector constant without a load. SSE/AVX provide the bare minimum /// functionality to do this, so it's all zeros, all ones, or some derivation /// that is cheap to calculate. static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); // Vectors containing all zeros can be matched by pxor and xorps. if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, DL); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || VT == MVT::v16i32 || (VT == MVT::v8i32 && Subtarget.hasInt256())) return Op; return getOnesVector(VT, DAG, DL); } return SDValue(); } /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute /// from a vector of source values and a vector of extraction indices. /// The vectors might be manipulated to match the type of the permute op. static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT ShuffleVT = VT; EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); unsigned NumElts = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); // Adjust IndicesVec to match VT size. assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts && "Illegal variable permute mask size"); if (IndicesVec.getValueType().getVectorNumElements() > NumElts) IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec), NumElts * VT.getScalarSizeInBits()); IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); // Handle SrcVec that don't match VT type. 
if (SrcVec.getValueSizeInBits() != SizeInBits) { if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) { // Handle larger SrcVec by treating it as a larger permute. unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits; VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts); IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); return extractSubVector( createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0, DAG, DL, SizeInBits); } else if (SrcVec.getValueSizeInBits() < SizeInBits) { // Widen smaller SrcVec to match VT. SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); } else return SDValue(); } auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) { assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"); EVT SrcVT = Idx.getValueType(); unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale; uint64_t IndexScale = 0; uint64_t IndexOffset = 0; // If we're scaling a smaller permute op, then we need to repeat the // indices, scaling and offsetting them as well. // e.g. v4i32 -> v16i8 (Scale = 4) // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4) // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0) for (uint64_t i = 0; i != Scale; ++i) { IndexScale |= Scale << (i * NumDstBits); IndexOffset |= i << (i * NumDstBits); } Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT)); Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx, DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT)); return Idx; }; unsigned Opcode = 0; switch (VT.SimpleTy) { default: break; case MVT::v16i8: if (Subtarget.hasSSSE3()) Opcode = X86ISD::PSHUFB; break; case MVT::v8i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v4f32: case MVT::v4i32: if (Subtarget.hasAVX()) { Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v4f32; } else if (Subtarget.hasSSSE3()) { Opcode = X86ISD::PSHUFB; ShuffleVT = MVT::v16i8; } break; case MVT::v2f64: case MVT::v2i64: if (Subtarget.hasAVX()) { // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); Opcode = X86ISD::VPERMILPV; ShuffleVT = MVT::v2f64; } else if (Subtarget.hasSSE41()) { // SSE41 can compare v2i64 - select between indices 0 and 1. 
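      // Illustration: the select below acts per lane. With hypothetical indices
      // <1, 0>, lane 0 compares 1 != 0 and takes the {1,1} splat (Src[1]), while
      // lane 1 compares 0 == 0 and takes the {0,0} splat (Src[0]).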
return DAG.getSelectCC( DL, IndicesVec, getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}), DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}), ISD::CondCode::SETEQ); } break; case MVT::v32i8: if (Subtarget.hasVLX() && Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasXOP()) { SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL); SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL); SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL); return DAG.getNode( ISD::CONCAT_VECTORS, DL, VT, DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx), DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx)); } else if (Subtarget.hasAVX()) { SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL); SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL); SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo); SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi); auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Permute Lo and Hi and then select based on index range. // This works as SHUFB uses bits[3:0] to permute elements and we don't // care about the bit[7] as its just an index vector. SDValue Idx = Ops[2]; EVT VT = Idx.getValueType(); return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx), DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx), ISD::CondCode::SETGT); }; SDValue Ops[] = {LoLo, HiHi, IndicesVec}; return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops, PSHUFBBuilder); } break; case MVT::v16i16: if (Subtarget.hasVLX() && Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { // Scale to v32i8 and perform as v32i8. IndicesVec = ScaleIndices(IndicesVec, 2); return DAG.getBitcast( VT, createVariablePermute( MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec), DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget)); } break; case MVT::v8f32: case MVT::v8i32: if (Subtarget.hasAVX2()) Opcode = X86ISD::VPERMV; else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {0, 1, 2, 3, 0, 1, 2, 3}); SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, {4, 5, 6, 7, 4, 5, 6, 7}); if (Subtarget.hasXOP()) return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, IndicesVec, DAG.getConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPS only uses index bits[0:1] to permute elements. 
SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v4i64: case MVT::v4f64: if (Subtarget.hasAVX512()) { if (!Subtarget.hasVLX()) { MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8); SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget, DAG, SDLoc(IndicesVec)); SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL, DAG, Subtarget); return extract256BitVector(Res, 0, DAG, DL); } Opcode = X86ISD::VPERMV; } else if (Subtarget.hasAVX()) { SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec); SDValue LoLo = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1}); SDValue HiHi = DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3}); // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); if (Subtarget.hasXOP()) return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, IndicesVec, DAG.getConstant(0, DL, MVT::i8))); // Permute Lo and Hi and then select based on index range. // This works as VPERMILPD only uses index bit[1] to permute elements. SDValue Res = DAG.getSelectCC( DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), ISD::CondCode::SETGT); return DAG.getBitcast(VT, Res); } break; case MVT::v64i8: if (Subtarget.hasVBMI()) Opcode = X86ISD::VPERMV; break; case MVT::v32i16: if (Subtarget.hasBWI()) Opcode = X86ISD::VPERMV; break; case MVT::v16f32: case MVT::v16i32: case MVT::v8f64: case MVT::v8i64: if (Subtarget.hasAVX512()) Opcode = X86ISD::VPERMV; break; } if (!Opcode) return SDValue(); assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && "Illegal variable permute shuffle type"); uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits(); if (Scale > 1) IndicesVec = ScaleIndices(IndicesVec, Scale); EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger(); IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec); SrcVec = DAG.getBitcast(ShuffleVT, SrcVec); SDValue Res = Opcode == X86ISD::VPERMV ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec) : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec); return DAG.getBitcast(VT, Res); } // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be // reasoned to be a permutation of a vector by indices in a non-constant vector. // (build_vector (extract_elt V, (extract_elt I, 0)), // (extract_elt V, (extract_elt I, 1)), // ... // -> // (vpermv I, V) // // TODO: Handle undefs // TODO: Utilize pshufb and zero mask blending to support more efficient // construction of vectors with constant-0 elements. static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue SrcVec, IndicesVec; // Check for a match of the permute source vector and permute index elements. // This is done by checking that the i-th build_vector operand is of the form: // (extract_elt SrcVec, (extract_elt IndicesVec, i)). 
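  // A concrete shape of the pattern (illustrative, v4i32 operands):
  //   e0 = extract_elt V, (extract_elt I, 0)
  //   e1 = extract_elt V, (extract_elt I, 1)
  //   e2 = extract_elt V, (extract_elt I, 2)
  //   e3 = extract_elt V, (extract_elt I, 3)
  //   build_vector e0, e1, e2, e3   -->   variable permute of V by I
  // Zero/sign extends of the inner extract are looked through below.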
for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { SDValue Op = V.getOperand(Idx); if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract encountered in V, set the source vector, // otherwise verify the extract is from the previously defined source // vector. if (!SrcVec) SrcVec = Op.getOperand(0); else if (SrcVec != Op.getOperand(0)) return SDValue(); SDValue ExtractedIndex = Op->getOperand(1); // Peek through extends. if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) ExtractedIndex = ExtractedIndex.getOperand(0); if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // If this is the first extract from the index vector candidate, set the // indices vector, otherwise verify the extract is from the previously // defined indices vector. if (!IndicesVec) IndicesVec = ExtractedIndex.getOperand(0); else if (IndicesVec != ExtractedIndex.getOperand(0)) return SDValue(); auto *PermIdx = dyn_cast(ExtractedIndex.getOperand(1)); if (!PermIdx || PermIdx->getZExtValue() != Idx) return SDValue(); } SDLoc DL(V); MVT VT = V.getSimpleValueType(); return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; BuildVectorSDNode *BV = cast(Op.getNode()); if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) return Broadcast; if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG)) return BitOp; unsigned EVTBits = EltVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet Values; unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) continue; Values.insert(Elt); if (!isa(Elt) && !isa(Elt)) { IsAllConstants = false; NumConstants--; } if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // If we are inserting one variable into a vector of non-zero constants, try // to avoid loading each constant element as a scalar. Load the constants as a // vector and then insert the variable scalar element. If insertion is not // supported, fall back to a shuffle to get the scalar blended with the // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old // build vector is replaced by undef in the constant vector. Save the // variable scalar element and its index for use in the insertelement. 
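    // For instance (illustrative): v4i32 <1, 2, %x, 4> becomes a constant-pool
    // load of <1, 2, undef, 4> followed by inserting %x at index 2, either with
    // INSERT_VECTOR_ELT (for the low 128 bits) or with a shuffle for higher
    // elements of wide vectors.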
LLVMContext &Context = *DAG.getContext(); Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); SmallVector ConstVecOps(NumElems, UndefValue::get(EltType)); SDValue VarElt; SDValue InsIndex; for (unsigned i = 0; i != NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); else if (auto *C = dyn_cast(Elt)) ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); else if (!Elt.isUndef()) { assert(!VarElt.getNode() && !InsIndex.getNode() && "Expected one variable element in this vector"); VarElt = Elt; InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); } } Constant *CV = ConstantVector::get(ConstVecOps); SDValue DAGConstVec = DAG.getConstantPool(CV, VT); // The constants we just created may not be legal (eg, floating point). We // must lower the vector right here because we can not guarantee that we'll // legalize it before loading it. This is also why we could not just create // a new build vector here. If the build vector contains illegal constants, // it could get split back up into a series of insert elements. // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); unsigned InsertC = cast(InsIndex)->getZExtValue(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); // There's no good way to insert into the high elements of a >128-bit // vector, so use shuffles to avoid an extract/insert sequence. assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) ShuffleMask.push_back(i == InsertC ? NumElts : i); SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); } // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. 
if (EltVT == MVT::i16 || EltVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.getSizeInBits() >= 256) { MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32); if (Subtarget.hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl); Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) return V; // See if we can use a vector load to get all of the elements. { SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; } // If this is a splat of pairs of 32-bit elements, we can use a narrower // build_vector and broadcast it. // TODO: We could probably generalize this more. if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef Ops) { // Make sure all the even/odd operands match. for (unsigned i = 2; i != NumElems; ++i) if (Ops[i % 2] != Op.getOperand(i)) return false; return true; }; if (CanSplat(Op, NumElems, Ops)) { MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; MVT NarrowVT = MVT::getVectorVT(EltVT, 4); // Create a new build vector and cast to v2i64/v2f64. 
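      // Sketch of the transform (illustrative values): a v8i32 build_vector
      // <a,b,a,b,a,b,a,b> becomes build_vector v4i32 <a,b,undef,undef>,
      // bitcast to v2i64, VBROADCAST to v4i64, and a final bitcast back to
      // v8i32.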
SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), DAG.getBuildVector(NarrowVT, dl, Ops)); // Broadcast from v2i64/v2f64 and cast to final VT. MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, NewBV)); } } // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.getSizeInBits() > 128) { MVT HVT = MVT::getVectorVT(EltVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl, VT.getSizeInBits() / 2); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. if (NumElems == 4 && NumZero > 0) { SmallVector Ops(NumElems); for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); else Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros >> (i*2)) & 0x3) { default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break; case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast(Reverse2 ? NumElems+1 : NumElems), static_cast(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. 
if (Subtarget.hasSSE41()) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).isUndef()) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). SmallVector Ops(NumElems); for (unsigned i = 0; i < NumElems; ++i) { if (!Op.getOperand(i).isUndef()) Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else Ops[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 1 ==> X: // : unpcklps 2, 3 ==> Y: // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { // Generate scaled UNPCKL shuffle mask. SmallVector Mask; for(unsigned i = 0; i != Scale; ++i) Mask.push_back(i); for (unsigned i = 0; i != Scale; ++i) Mask.push_back(NumElems+i); Mask.append(NumElems - Mask.size(), SM_SentinelUndef); for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. // TODO: Detect subvector broadcast here instead of DAG combine? static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); unsigned NumOperands = Op.getNumOperands(); unsigned NumZero = 0; unsigned NumNonZero = 0; unsigned NonZeros = 0; for (unsigned i = 0; i != NumOperands; ++i) { SDValue SubVec = Op.getOperand(i); if (SubVec.isUndef()) continue; if (ISD::isBuildVectorAllZeros(SubVec.getNode())) ++NumZero; else { assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. NonZeros |= 1 << i; ++NumNonZero; } } // If we have more than 2 non-zeros, build each half separately. if (NumNonZero > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // Otherwise, build it up through insert_subvectors. SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); MVT SubVT = Op.getOperand(0).getSimpleValueType(); unsigned NumSubElems = SubVT.getVectorNumElements(); for (unsigned i = 0; i != NumOperands; ++i) { if ((NonZeros & (1 << i)) == 0) continue; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i), DAG.getIntPtrConstant(i * NumSubElems, dl)); } return Vec; } // Return true if all the operands of the given CONCAT_VECTORS node are zeros // except for the first one. 
// (CONCAT_VECTORS Op, 0, 0,...,0)
static bool isExpandWithZeros(const SDValue &Op) {
  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

  for (unsigned i = 1; i < Op.getNumOperands(); i++)
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
      return false;

  return true;
}

// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
  unsigned Opc = Op.getOpcode();

  assert(Opc == ISD::CONCAT_VECTORS &&
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected node to check for type promotion!");

  // As long as we are concatenating zeros to the upper part of a previous node
  // result, climb up the tree until a node with different opcode is
  // encountered
  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
    if (Opc == ISD::INSERT_SUBVECTOR) {
      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
          Op.getConstantOperandVal(2) == 0)
        Op = Op.getOperand(1);
      else
        return SDValue();
    } else { // Opc == ISD::CONCAT_VECTORS
      if (isExpandWithZeros(Op))
        Op = Op.getOperand(0);
      else
        return SDValue();
    }
    Opc = Op.getOpcode();
  }

  // Check if the first inserted node zeroes the upper bits, or an 'and' result
  // of a node that zeros the upper bits (its masked version).
  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
      (Op.getOpcode() == ISD::AND &&
       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
    return Op;
  }

  return SDValue();
}

// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with instruction that zeroes all upper (irrelevant) bits of the
  // output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
    return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      ++NumZero;
    else {
      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
      NonZeros |= (uint64_t)1 << i;
      ++NumNonZero;
    }
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NumNonZero <= 1) {
    SDValue Vec = NumZero ?
getZeroVector(ResVT, Subtarget, DAG, dl) : DAG.getUNDEF(ResVT); if (!NumNonZero) return Vec; unsigned Idx = countTrailingZeros(NonZeros); SDValue SubVec = Op.getOperand(Idx); unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); } if (NumOperands > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); ArrayRef Ops = Op->ops(); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(0, NumOperands/2)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } assert(NumNonZero == 2 && "Simple cases not handled?"); if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT), Op.getOperand(0), DAG.getIntPtrConstant(0, dl)); unsigned NumElems = ResVT.getVectorNumElements(); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != i) return false; } return true; } /// Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// Test whether a shuffle mask is equivalent within each sub-lane. 
/// /// This checks a shuffle mask to see if it is performing the same /// lane-relative shuffle in each sub-lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// suitable for use with existing 128-bit shuffles as entries from the second /// vector have been remapped to [LaneSize, 2*LaneSize). static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0); if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] < 0) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Test whether a shuffle mask is equivalent within each 128-bit lane. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask) { SmallVector RepeatedMask; return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask); } /// Test whether a shuffle mask is equivalent within each 256-bit lane. static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask); } /// Test whether a target shuffle mask is equivalent within each sub-lane. /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero. static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef Mask, SmallVectorImpl &RepeatedMask) { int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits(); RepeatedMask.assign(LaneSize, SM_SentinelUndef); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0)); if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] == SM_SentinelZero) { if (!isUndefOrZero(RepeatedMask[i % LaneSize])) return false; RepeatedMask[i % LaneSize] = SM_SentinelZero; continue; } if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. // Adjust second vector indices to start at LaneSize instead of Size. int LocalM = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize; if (RepeatedMask[i % LaneSize] == SM_SentinelUndef) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = LocalM; else if (RepeatedMask[i % LaneSize] != LocalM) // Found a mismatch with the repeated mask. return false; } return true; } /// Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. 
/// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef Mask, ArrayRef ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast(V1); auto *BV2 = dyn_cast(V2); for (int i = 0; i < Size; ++i) { assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } } return true; } /// Checks whether a target shuffle mask is equivalent to an explicit pattern. /// /// The masks must be exactly the same width. /// /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding /// value in ExpectedMask is always accepted. Otherwise the indices must match. /// /// SM_SentinelZero is accepted as a valid negative index but must match in both. static bool isTargetShuffleEquivalent(ArrayRef Mask, ArrayRef ExpectedMask) { int Size = Mask.size(); if (Size != (int)ExpectedMask.size()) return false; for (int i = 0; i < Size; ++i) if (Mask[i] == SM_SentinelUndef) continue; else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero) return false; else if (Mask[i] != ExpectedMask[i]) return false; return true; } // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle // mask. static SmallVector createTargetShuffleMask(ArrayRef Mask, const APInt &Zeroable) { int NumElts = Mask.size(); assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes"); SmallVector TargetMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index"); TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M); } return TargetMask; } // Attempt to create a shuffle mask from a VSELECT condition mask. static bool createShuffleMaskFromVSELECT(SmallVectorImpl &Mask, SDValue Cond) { if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return false; unsigned Size = Cond.getValueType().getVectorNumElements(); Mask.resize(Size, SM_SentinelUndef); for (int i = 0; i != (int)Size; ++i) { SDValue CondElt = Cond.getOperand(i); Mask[i] = i; // Arbitrarily choose from the 2nd operand if the select condition element // is undef. // TODO: Can we do better by matching patterns such as even/odd? if (CondElt.isUndef() || isNullConstant(CondElt)) Mask[i] += Size; } return true; } // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd // instructions. 
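// For reference (derived from standard x86 unpack semantics, shown here only
// for illustration): for v8i16 the binary unpcklwd mask is <0,8,1,9,2,10,3,11>
// and the unpckhwd mask is <4,12,5,13,6,14,7,15>; the v8i32/v8f32 mask being
// tested must be equivalent to one of these.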
static bool isUnpackWdShuffleMask(ArrayRef Mask, MVT VT) { if (VT != MVT::v8i32 && VT != MVT::v8f32) return false; SmallVector Unpcklwd; createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true, /* Unary = */ false); SmallVector Unpckhwd; createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false, /* Unary = */ false); bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) || isTargetShuffleEquivalent(Mask, Unpckhwd)); return IsUnpackwdMask; } /// Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static unsigned getV4X86ShuffleImm(ArrayRef Mask) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6; return Imm; } static SDValue getV4X86ShuffleImm8ForMask(ArrayRef Mask, const SDLoc &DL, SelectionDAG &DAG) { return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8); } /// Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static APInt computeZeroableShuffleElements(ArrayRef Mask, SDValue V1, SDValue V2) { APInt Zeroable(Mask.size(), 0); V1 = peekThroughBitcasts(V1); V2 = peekThroughBitcasts(V2); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); int VectorSizeInBits = V1.getValueSizeInBits(); int ScalarSizeInBits = VectorSizeInBits / Mask.size(); assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size"); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable.setBit(i); continue; } // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; M %= Size; // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. if (V.getOpcode() != ISD::BUILD_VECTOR) continue; // If the BUILD_VECTOR has fewer elements then the bitcasted portion of // the (larger) source element must be UNDEF/ZERO. 
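    // Example of this case (illustrative): a v4i32 mask looking through a
    // bitcast at a BUILD_VECTOR of two i64 constants has Scale == 2, so mask
    // element M selects the low or high 32-bit half of operand M/2, and that
    // half must be all zero (or undef) for the lane to count as zeroable.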
if ((Size % V.getNumOperands()) == 0) { int Scale = Size / V->getNumOperands(); SDValue Op = V.getOperand(M / Scale); if (Op.isUndef() || X86::isZeroNode(Op)) Zeroable.setBit(i); else if (ConstantSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getAPIntValue(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } else if (ConstantFPSDNode *Cst = dyn_cast(Op)) { APInt Val = Cst->getValueAPF().bitcastToAPInt(); Val.lshrInPlace((M % Scale) * ScalarSizeInBits); Val = Val.getLoBits(ScalarSizeInBits); if (Val == 0) Zeroable.setBit(i); } continue; } // If the BUILD_VECTOR has more elements then all the (smaller) source // elements must be UNDEF or ZERO. if ((V.getNumOperands() % Size) == 0) { int Scale = V->getNumOperands() / Size; bool AllZeroable = true; for (int j = 0; j < Scale; ++j) { SDValue Op = V.getOperand((M * Scale) + j); AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op)); } if (AllZeroable) Zeroable.setBit(i); continue; } } return Zeroable; } // The Shuffle result is as follow: // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order. // Each Zeroable's element correspond to a particular Mask's element. // As described in computeZeroableShuffleElements function. // // The function looks for a sub-mask that the nonzero elements are in // increasing order. If such sub-mask exist. The function returns true. static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef Mask, const EVT &VectorType, bool &IsZeroSideLeft) { int NextElement = -1; // Check if the Mask's nonzero elements are in increasing order. for (int i = 0, e = Mask.size(); i < e; i++) { // Checks if the mask's zeros elements are built from only zeros. assert(Mask[i] >= -1 && "Out of bound mask element!"); if (Mask[i] < 0) return false; if (Zeroable[i]) continue; // Find the lowest non zero element if (NextElement < 0) { NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0; IsZeroSideLeft = NextElement != 0; } // Exit if the mask's non zero elements are not in increasing order. if (NextElement != Mask[i]) return false; NextElement++; } return true; } /// Try to lower a shuffle with a single PSHUFB of V1 or V2. static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); const int NumBytes = VT.getSizeInBits() / 8; const int NumEltBytes = VT.getScalarSizeInBits() / 8; assert((Subtarget.hasSSSE3() && VT.is128BitVector()) || (Subtarget.hasAVX2() && VT.is256BitVector()) || (Subtarget.hasBWI() && VT.is512BitVector())); SmallVector PSHUFBMask(NumBytes); // Sign bit set in i8 mask means zero element. SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8); SDValue V; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / NumEltBytes]; if (M < 0) { PSHUFBMask[i] = DAG.getUNDEF(MVT::i8); continue; } if (Zeroable[i / NumEltBytes]) { PSHUFBMask[i] = ZeroMask; continue; } // We can only use a single input of V1 or V2. SDValue SrcV = (M >= Size ? V2 : V1); if (V && V != SrcV) return SDValue(); V = SrcV; M %= Size; // PSHUFB can't cross lanes, ensure this doesn't happen. 
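    // Byte-mask construction example (illustrative): for a v4i32 shuffle, mask
    // element value 1 expands below to the four bytes 1*4+0 .. 1*4+3, i.e.
    // {4,5,6,7}; a zeroable element instead produces four 0x80 bytes and an
    // undef element four undef bytes.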
if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize)) return SDValue(); M = M % LaneSize; M = M * NumEltBytes + (i % NumEltBytes); PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8); } assert(V && "Failed to find a source input"); MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V), DAG.getBuildVector(I8VT, DL, PSHUFBMask))); } static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl); // X86 has dedicated shuffle that can be lowered to VEXPAND static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsLeftZeroSide = true; if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(), IsLeftZeroSide)) return SDValue(); unsigned VEXPANDMask = (~Zeroable).getZExtValue(); MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType); unsigned NumElts = VT.getVectorNumElements(); assert((NumElts == 4 || NumElts == 8 || NumElts == 16) && "Unexpected number of vector elements"); SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts), Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; return DAG.getSelect(DL, VT, VMask, DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true; for (int i = 0; i != NumElts; i += 2) { int M1 = TargetMask[i + 0]; int M2 = TargetMask[i + 1]; Undef1 &= (SM_SentinelUndef == M1); Undef2 &= (SM_SentinelUndef == M2); Zero1 &= isUndefOrZero(M1); Zero2 &= isUndefOrZero(M2); } assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) && "Zeroable shuffle detected"); // Attempt to match the target mask against the unpack lo/hi mask patterns. SmallVector Unpckl, Unpckh; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2)); V1 = (Undef1 ? DAG.getUNDEF(VT) : V1); return true; } // If an unary shuffle, attempt to match as an unpack lo/hi with zero. if (IsUnary && (Zero1 || Zero2)) { // Don't bother if we can blend instead. if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) && isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0)) return false; bool MatchLo = true, MatchHi = true; for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) { int M = TargetMask[i]; // Ignore if the input is known to be zero or the index is undef. if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) || (M == SM_SentinelUndef)) continue; MatchLo &= (M == Unpckl[i]); MatchHi &= (M == Unpckh[i]); } if (MatchLo || MatchHi) { UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; V2 = Zero2 ? 
getZeroVector(VT, Subtarget, DAG, DL) : V1; V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1; return true; } } // If a binary shuffle, commute and try again. if (!IsUnary) { ShuffleVectorSDNode::commuteMask(Unpckl); if (isTargetShuffleEquivalent(TargetMask, Unpckl)) { UnpackOpcode = X86ISD::UNPCKL; std::swap(V1, V2); return true; } ShuffleVectorSDNode::commuteMask(Unpckh); if (isTargetShuffleEquivalent(TargetMask, Unpckh)) { UnpackOpcode = X86ISD::UNPCKH; std::swap(V1, V2); return true; } } return false; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SmallVector Unpckl; createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); SmallVector Unpckh; createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } static bool matchVectorShuffleAsVPMOV(ArrayRef Mask, bool SwappedOps, int Delta) { int Size = (int)Mask.size(); int Split = Size / Delta; int TruncatedVectorStart = SwappedOps ? Size : 0; // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,... if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta)) return false; // The rest of the mask should not refer to the truncated vector's elements. if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart, TruncatedVectorStart + Size)) return false; return true; } // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. // // An example is the following: // // t0: ch = EntryToken // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 // t25: v4i32 = truncate t2 // t41: v8i16 = bitcast t25 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 // t18: v2i64 = bitcast t51 // // Without avx512vl, this is lowered to: // // vpmovqd %zmm0, %ymm0 // vpshufb {{.*#+}} xmm0 = // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero // // But when avx512vl is available, one can just use a single vpmovdw // instruction. static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (VT != MVT::v16i8 && VT != MVT::v8i16) return SDValue(); if (Mask.size() != VT.getVectorNumElements()) return SDValue(); bool SwappedOps = false; if (!ISD::isBuildVectorAllZeros(V2.getNode())) { if (!ISD::isBuildVectorAllZeros(V1.getNode())) return SDValue(); std::swap(V1, V2); SwappedOps = true; } // Look for: // // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8> // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16> // // and similar ones. 
if (V1.getOpcode() != ISD::BITCAST) return SDValue(); if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE) return SDValue(); SDValue Src = V1.getOperand(0).getOperand(0); MVT SrcVT = Src.getSimpleValueType(); // The vptrunc** instructions truncating 128 bit and 256 bit vectors // are only available with avx512vl. if (!SrcVT.is512BitVector() && !Subtarget.hasVLX()) return SDValue(); // Down Convert Word to Byte is only available with avx512bw. The case with // 256-bit output doesn't contain a shuffle and is therefore not handled here. if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 && !Subtarget.hasBWI()) return SDValue(); // The first half/quarter of the mask should refer to every second/fourth // element of the vector truncated and bitcasted. if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) && !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4)) return SDValue(); return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src); } // X86 has dedicated pack instructions that can handle specific truncation // operations: PACKSS and PACKUS. static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef TargetMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumElts = VT.getVectorNumElements(); unsigned BitSize = VT.getScalarSizeInBits(); MVT PackSVT = MVT::getIntegerVT(BitSize * 2); MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); auto MatchPACK = [&](SDValue N1, SDValue N2) { SDValue VV1 = DAG.getBitcast(PackVT, N1); SDValue VV2 = DAG.getBitcast(PackVT, N2); if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKUS; return true; } } if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { V1 = VV1; V2 = VV2; SrcVT = PackVT; PackOpcode = X86ISD::PACKSS; return true; } return false; }; // Try binary shuffle. SmallVector BinaryMask; createPackShuffleMask(VT, BinaryMask, false); if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) if (MatchPACK(V1, V2)) return true; // Try unary shuffle. SmallVector UnaryMask; createPackShuffleMask(VT, UnaryMask, true); if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) if (MatchPACK(V1, V1)) return true; return false; } static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT PackVT; unsigned PackOpcode; if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, Subtarget)) return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), DAG.getBitcast(PackVT, V2)); return SDValue(); } /// Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. 
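///
/// Worked example (illustrative): for v4i32 with Mask = <0, zz, 2, zz>, where
/// the 'zz' lanes are zeroable, every non-zeroable lane already sits in its
/// own position and comes from V1, so the shuffle becomes
///   AND(V1, <-1, 0, -1, 0>)
/// i.e. a constant mask plus one vector AND rather than a blend.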
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "Floating point types are not supported"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); return DAG.getNode(ISD::AND, DL, VT, V, VMask); } /// Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); SmallVector MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG); static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, MutableArrayRef TargetMask, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask) { bool V1IsZeroOrUndef = V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZeroOrUndef = V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); BlendMask = 0; ForceV1Zero = false, ForceV2Zero = false; assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask"); // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. for (int i = 0, Size = TargetMask.size(); i < Size; ++i) { int M = TargetMask[i]; if (M == SM_SentinelUndef) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1ull << i; continue; } if (M == SM_SentinelZero) { if (V1IsZeroOrUndef) { ForceV1Zero = true; TargetMask[i] = i; continue; } if (V2IsZeroOrUndef) { ForceV2Zero = true; BlendMask |= 1ull << i; TargetMask[i] = i + Size; continue; } } return false; } return true; } static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); return ScaledMask; } /// Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. 
It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Mask = createTargetShuffleMask(Original, Zeroable); uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero, BlendMask)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); LLVM_FALLTHROUGH; case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget.hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } LLVM_FALLTHROUGH; case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) BlendMask |= 1ull << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } // Use PBLENDW for lower/upper lanes and then blend lanes. // TODO - we should allow 2 PBLENDW here and leave shuffle combine to // merge to VSELECT where useful. uint64_t LoMask = BlendMask & 0xFF; uint64_t HiMask = (BlendMask >> 8) & 0xFF; if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(LoMask, DL, MVT::i8)); SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(HiMask, DL, MVT::i8)); return DAG.getVectorShuffle( MVT::v16i16, DL, Lo, Hi, {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); } LLVM_FALLTHROUGH; } case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget.hasAVX2()) && "256-bit byte-blends require AVX2 support!"); // Attempt to lower to a bitmask if we can. 
VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // x86 allows load folding with blendvb from the 2nd source operand. But // we are still using LLVM select here (see comment below), so that's V1. // If V2 can be load-folded and V1 cannot be load-folded, then commute to // allow that load-folding possibility. if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(V1, V2); } // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); } case MVT::v16f32: case MVT::v8f64: case MVT::v8i64: case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, bool ImmBlends = false) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector BlendMask(Mask.size(), -1); SmallVector PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] < 0) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! 
PermuteMask[i] = Mask[i] % Size; } // If only immediate blends, then bail if the blend mask can't be widened to // i16. unsigned EltSize = VT.getScalarSizeInBits(); if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) return SDValue(); SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// Try to lower as an unpack of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can unpack elements from two inputs and /// then reduce the shuffle to a single-input (wider) permutation. static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; int NumHalfLaneElts = NumLaneElts / 2; bool MatchLo = true, MatchHi = true; SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; // Determine UNPCKL/UNPCKH type and operand order. for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; SDValue &Op = Ops[Elt & 1]; if (M < NumElts && (Op.isUndef() || Op == V1)) Op = V1; else if (NumElts <= M && (Op.isUndef() || Op == V2)) Op = V2; else return SDValue(); int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts; MatchLo &= isUndefOrInRange(M, Lo, Mid) || isUndefOrInRange(M, NumElts + Lo, NumElts + Mid); MatchHi &= isUndefOrInRange(M, Mid, Hi) || isUndefOrInRange(M, NumElts + Mid, NumElts + Hi); if (!MatchLo && !MatchHi) return SDValue(); } } assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"); // Now check that each pair of elts come from the same unpack pair // and set the permute mask based on each pair. // TODO - Investigate cases where we permute individual elements. SmallVector PermuteMask(NumElts, -1); for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { for (int Elt = 0; Elt != NumLaneElts; Elt += 2) { int M0 = Mask[Lane + Elt + 0]; int M1 = Mask[Lane + Elt + 1]; if (0 <= M0 && 0 <= M1 && (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts)) return SDValue(); if (0 <= M0) PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts)); if (0 <= M1) PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1; } } unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops); return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); } /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then /// permuting the elements of the result in place. static SDValue lowerVectorShuffleAsByteRotateAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || (VT.is256BitVector() && !Subtarget.hasAVX2()) || (VT.is512BitVector() && !Subtarget.hasBWI())) return SDValue(); // We don't currently support lane crossing permutes. if (is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); int Scale = VT.getScalarSizeInBits() / 8; int NumLanes = VT.getSizeInBits() / 128; int NumElts = VT.getVectorNumElements(); int NumEltsPerLane = NumElts / NumLanes; // Determine range of mask elts. 
bool Blend1 = true; bool Blend2 = true; std::pair Range1 = std::make_pair(INT_MAX, INT_MIN); std::pair Range2 = std::make_pair(INT_MAX, INT_MIN); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) { Blend1 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range1.first = std::min(Range1.first, M); Range1.second = std::max(Range1.second, M); } else { M -= NumElts; Blend2 &= (M == (Lane + Elt)); assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); M = M % NumEltsPerLane; Range2.first = std::min(Range2.first, M); Range2.second = std::max(Range2.second, M); } } } // Bail if we don't need both elements. // TODO - it might be worth doing this for unary shuffles if the permute // can be widened. if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || !(0 <= Range2.first && Range2.second < NumEltsPerLane)) return SDValue(); if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) return SDValue(); // Rotate the 2 ops so we can access both ranges, then permute the result. auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue Rotate = DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), DAG.getBitcast(ByteVT, Lo), DAG.getConstant(Scale * RotAmt, DL, MVT::i8))); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { int M = Mask[Lane + Elt]; if (M < 0) continue; if (M < NumElts) PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); else PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); } } return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); }; // Check if the ranges are small enough to rotate from either direction. if (Range2.second < Range1.first) return RotateAndPermute(V1, V2, Range1.first, 0); if (Range1.second < Range2.first) return RotateAndPermute(V2, V1, Range2.first, NumElts); return SDValue(); } /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector V1Mask(Mask.size(), -1); SmallVector V2Mask(Mask.size(), -1); SmallVector BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend/unpack/rotate strategies unless // one of the input shuffles would be a no-op. We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. 
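// Worked example (illustrative): a v4i32 mask <1, 4, 3, 6> decomposes into
//   V1Mask    = <1, -1, 3, -1>
//   V2Mask    = <-1, 0, -1, 2>
//   BlendMask = <0, 5, 2, 7>
// Neither per-input mask is a no-op here, so the cheaper blend/unpack/rotate
// forms below are tried before falling back to two pre-shuffles plus a blend.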
if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { // Only prefer immediate blends to unpack/rotate. if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; if (SDValue UnpackPerm = lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; if (SDValue RotatePerm = lowerVectorShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; // Unpack/rotate failed - try again with variable blends. if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// Try to lower a vector shuffle as a rotation. /// /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512. static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef Mask) { int NumElts = Mask.size(); // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int i = 0; i < NumElts; ++i) { int M = Mask[i]; assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && "Unexpected mask index."); if (M < 0) continue; // Determine where a rotated vector would have started. int StartIdx = i - (M % NumElts); if (StartIdx == 0) // The identity rotation isn't interesting, stop. return -1; // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return -1; // Compute which value this mask is pointing at. SDValue MaskV = M < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return -1; } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; V1 = Lo; V2 = Hi; return Rotation; } /// Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. 
/// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask) { // Don't accept any shuffles with zero elements. if (any_of(Mask, [](int M) { return M == SM_SentinelZero; })) return -1; // PALIGNR works on 128-bit lanes. SmallVector RepeatedMask; if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) return -1; int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask); if (Rotation <= 0) return -1; // PALIGNR rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int NumElts = RepeatedMask.size(); int Scale = 16 / NumElts; return Rotation * Scale; } static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); SDValue Lo = V1, Hi = V2; int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask); if (ByteRotation <= 0) return SDValue(); // Cast the inputs to i8 vector of correct length to match PALIGNR or // PSLLDQ/PSRLDQ. MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); Lo = DAG.getBitcast(ByteVT, Lo); Hi = DAG.getBitcast(ByteVT, Hi); // SSSE3 targets can use the palignr instruction. if (Subtarget.hasSSSE3()) { assert((!VT.is512BitVector() || Subtarget.hasBWI()) && "512-bit PALIGNR requires BWI instructions"); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, DAG.getConstant(ByteRotation, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); assert(ByteVT == MVT::v16i8 && "SSE2 rotate lowering only needed for v16i8!"); // Default SSE2 implementation int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); } /// Try to lower a vector shuffle as a dword/qword rotation. /// /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary /// rotation of the concatenation of two vectors; This routine will /// try to generically lower a vector shuffle through such an pattern. /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!"); // 128/256-bit vectors are only supported with VLX. 
assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) && "VLX required for 128/256-bit vectors"); SDValue Lo = V1, Hi = V2; int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask); if (Rotation <= 0) return SDValue(); return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, DAG.getConstant(Rotation, DL, MVT::i8)); } /// Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget) { int Size = Mask.size(); unsigned SizeInBits = Size * ScalarSizeInBits; auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) return -1; } int ShiftEltBits = ScalarSizeInBits * Scale; bool ByteShift = ShiftEltBits > 64; Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) : MVT::getVectorVT(ShiftSVT, Size / Scale); return (int)ShiftAmt; }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 
64 : 128); for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) { int ShiftAmt = MatchShift(Shift, Scale, Left); if (0 < ShiftAmt) return ShiftAmt; } // no match return -1; } static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); MVT ShiftVT; SDValue V = V1; unsigned Opcode; // Try to match shuffle against V1 shift. int ShiftAmt = matchVectorShuffleAsShift( ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget); // If V1 failed, try to match shuffle against V2 shift. if (ShiftAmt < 0) { ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, Size, Zeroable, Subtarget); V = V2; } if (ShiftAmt < 0) return SDValue(); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); } // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return false; if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return false; } if (!Src || Idx < 0) return false; assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Src; return true; } // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef Mask, uint64_t &BitLen, uint64_t &BitIdx) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return false; for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. 
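// Worked example (illustrative): for v16i8 with
//   Mask = <0, 1, 16, 17, 18, 5, 6, 7> and the upper eight lanes undef,
// the search below settles on Idx = 2, Len = 3 with Base = V1 and
// Insert = V2, giving BitIdx = 16 and BitLen = 24 (both in bits).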
if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; V1 = Base; V2 = Insert; return true; } } return false; } /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), V2 ? V2 : DAG.getUNDEF(VT), DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } /// Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offsetted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. 
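// For instance (illustrative), with NumElements = 8, Scale = 2 and
// Offset = 4 the helper below produces ShMask = <4, 5, 6, 7, -1, -1, -1, -1>,
// i.e. the four base elements starting at the offset are slid down to
// positions 0..3 before the extension is formed (assuming the source indices
// stay within the offset's 128-bit lane).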
auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget.hasSSE41()) { // Not worth offsetting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = ShuffleOffset(InputV); InputV = getExtendInVec(/*Signed*/false, DL, ExtVT, InputV, DAG); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) return DAG.getBitcast(VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getBitcast(VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. 
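// Sketch of the fallback below (illustrative): zero-extending v16i8 to v4i32
// (Scale == 4, Offset == 0) emits UNPCKL(bytes, zero) followed by
// UNPCKL(words, zero). With Offset == 5 the offset is first realigned:
// AlignToUnpack = 5 % (16 / 4) = 1, so the input is shifted down by one
// element and the unpack loop then runs with Offset == 4.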
int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M < 0) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? 
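// (Illustrative) A fully zeroable mask such as <zz, zz, zz, zz> never sets
// InputV and is rejected just below, while e.g. a v8i16 mask
// <8, zz, 9, zz, 10, zz, 11, zz> matches with Scale == 2, InputV == V2 and
// Offset == 0; AnyExt survives only when the zz lanes are undef rather than
// explicitly forced to zero.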
if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); V = peekThroughBitcasts(V); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getBitcast(EltVT, S); } return SDValue(); } /// Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { V = peekThroughBitcasts(V); return ISD::isNON_EXTLoad(V.getNode()); } /// Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. 
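///
/// Roughly (illustrative): a v4i32 shuffle <4, zz, zz, zz> of a scalar-built
/// V2 into zeros becomes VZEXT_MOVL(SCALAR_TO_VECTOR(x)), while a v4f32
/// shuffle <4, 1, 2, 3> with a non-zeroable V1 is matched to MOVSS(V1, V2).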
static SDValue lowerVectorShuffleAsElementInsertion( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v16i8, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v16i8, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8)); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. 
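///
/// Example (illustrative): broadcasting element 3 of a v8i16 view of a
/// v4i32 build_vector takes operand 3/2 = 1 of \p V0, shifts it right by
/// 16 bits (since 3 is odd), truncates to i16 and feeds that scalar to
/// VBROADCAST.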
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8)); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) || (Subtarget.hasAVX() && VT.isFloatingPoint()) || (Subtarget.hasAVX2() && VT.isInteger()))) return SDValue(); // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int i = 0; i != (int)NumElts; ++i) { SmallVector BroadcastMask(NumElts, i); if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) { BroadcastIdx = i; break; } } if (BroadcastIdx < 0) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. SDValue V = V1; for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { // Peek through bitcasts as long as BroadcastIdx can be adjusted. 
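// E.g. (illustrative) if V is a v4i64 bitcast of a v8i32 source and we are
// broadcasting element 1, the index is rescaled to element 2 of the v8i32
// value; peeking stops if the index cannot be remapped exactly.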
SDValue VSrc = V.getOperand(0); unsigned NumEltBits = V.getScalarValueSizeInBits(); unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); if ((NumEltBits % NumSrcBits) == 0) BroadcastIdx *= (NumEltBits / NumSrcBits); else if ((NumSrcBits % NumEltBits) == 0 && (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) BroadcastIdx /= (NumSrcBits / NumEltBits); else break; V = VSrc; continue; } case ISD::CONCAT_VECTORS: { int OperandSize = V.getOperand(0).getSimpleValueType().getVectorNumElements(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast(V.getOperand(2)); if (!ConstantIdx) break; int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; } else { V = VOuter; } continue; } } break; } // Ensure the source vector and BroadcastIdx are for a suitable type. if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { unsigned NumEltBits = VT.getScalarSizeInBits(); unsigned NumSrcBits = V.getScalarValueSizeInBits(); if ((NumSrcBits % NumEltBits) == 0) BroadcastIdx *= (NumSrcBits / NumEltBits); else if ((NumEltBits % NumSrcBits) == 0 && (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) BroadcastIdx /= (NumEltBits / NumSrcBits); else return SDValue(); unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); V = DAG.getBitcast(SrcVT, V); } // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // First, look through bitcast: if the original value has a larger element // type than the shuffle, the broadcast element is in essence truncated. // Make that explicit to ease folding. if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; MVT BroadcastVT = VT; // Peek through any bitcast (only useful for loads). SDValue BC = peekThroughBitcasts(V); // Also check the simpler case, where we can directly reuse the scalar. if ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(BC) && !cast(BC)->isVolatile()) { // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) ? X86ISD::MOVDDUP : Opcode; } // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. 
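// E.g. (illustrative) broadcasting element 2 of a v8f32 load used only by
// this shuffle becomes a scalar f32 load from BaseAddr + 2 * 4 bytes followed
// by a VBROADCAST of that scalar, keeping the original load's memory
// ordering.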
LoadSDNode *Ld = cast(BC); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); DAG.makeEquivalentMemoryOrdering(Ld, V); } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); } else if (BroadcastIdx != 0) { // We can only broadcast from the zero-element of a vector register, // but it can be advantageous to broadcast from the zero-element of a // subvector. if (!VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); // VPERMQ/VPERMPD can perform the cross-lane shuffle directly. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // Only broadcast the zero-element of a 128-bit subvector. unsigned EltSize = VT.getScalarSizeInBits(); if (((BroadcastIdx * EltSize) % 128) != 0) return SDValue(); // The shuffle input might have been a bitcast we looked through; look at // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll // later bitcast it to BroadcastVT. assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); V = extract128BitVector(V, BroadcastIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, DAG.getBitcast(MVT::f64, V)); // Bitcast back to the same scalar type as BroadcastVT. MVT SrcVT = V.getSimpleValueType(); if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) { assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); if (SrcVT.isVector()) { unsigned NumSrcElts = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts); } else { SrcVT = BroadcastVT.getScalarType(); } V = DAG.getBitcast(SrcVT, V); } // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && SrcVT == MVT::i64) { V = DAG.getBitcast(MVT::f64, V); unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements(); BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts); } // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to // 128-bits, removing as many bitcasts as possible. if (SrcVT.getSizeInBits() > 128) { MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / SrcVT.getScalarSizeInBits()); V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); V = DAG.getBitcast(ExtVT, V); } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. 
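// A minimal standalone sketch (illustrative only; encodeInsertPSImm,
// insertPSModel and insertPSExample are assumed helper names, not LLVM or
// intrinsic APIs) of how the SSE4.1 INSERTPS immediate that the matcher below
// builds is laid out for a register source: bits 7:6 pick the source element
// of the second operand, bits 5:4 pick the destination lane, and bits 3:0
// zero individual result lanes. It only models the register-to-register form
// and is meant for checking the encoding by hand, not as the lowering itself.

#include <array>
#include <cassert>
#include <cstdint>

// Pack (source element, destination lane, zero mask) into an INSERTPS imm8.
static constexpr uint8_t encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                           unsigned ZMask) {
  return static_cast<uint8_t>((SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF));
}

// Scalar model of INSERTPS xmm1, xmm2, imm8 on four-lane float vectors.
static std::array<float, 4> insertPSModel(std::array<float, 4> Dst,
                                          const std::array<float, 4> &Src,
                                          uint8_t Imm) {
  Dst[(Imm >> 4) & 0x3] = Src[(Imm >> 6) & 0x3]; // insert the chosen element
  for (unsigned i = 0; i != 4; ++i)              // then apply the zero mask
    if (Imm & (1u << i))
      Dst[i] = 0.0f;
  return Dst;
}

static void insertPSExample() {
  // Shuffle mask {4, 1, 2, 3} with nothing zeroable: element 0 of V2 is
  // inserted into lane 0 of V1, so SrcIdx = 0, DstIdx = 0, ZMask = 0.
  std::array<float, 4> V1 = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> V2 = {9.0f, 8.0f, 7.0f, 6.0f};
  auto R = insertPSModel(V1, V2, encodeInsertPSImm(0, 0, 0x0));
  assert(R[0] == 9.0f && R[1] == 2.0f && R[2] == 3.0f && R[3] == 4.0f);
}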
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}

static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}

/// Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction.
Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerVectorShuffleAsPermuteAndUnpack( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(VT.is128BitVector() && "This routine only works on 128-bit vectors."); assert(!V2.isUndef() && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](int ScalarSize, int Scale) { SmallVector V1Mask((unsigned)Size, -1); SmallVector V2Mask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) return Unpack; // If we're shuffling with a zero vector then we're better off not doing // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. if (ISD::isBuildVectorAllZeros(V1.getNode()) || ISD::isBuildVectorAllZeros(V2.getNode())) return SDValue(); // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. 
// FIXME: We could consider commuting the unpacks. SmallVector PermMask((unsigned)Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode( X86ISD::SHUFP, DL, MVT::v2f64, Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. 
return DAG.getNode( X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. 
if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] < 0) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. 
} } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget.hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget.hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid // in SSE1 because otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // There are special ways we can lower some single-element blends. 
However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (Subtarget.hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use low/high mov instructions. These are only valid in SSE1 because // otherwise they are widened to v2f64 and never get here. if (!Subtarget.hasSSE2()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. 
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget.hasSSSE3()) { if (Subtarget.hasVLX()) if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (!isSingleSHUFPSMask(Mask)) { // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Unpack; } // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would incur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); return DAG.getBitcast(MVT::v4i32, ShufPS); } /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. 
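// A minimal standalone sketch of the three primitives the routine below
// composes: PSHUFLW permutes the low four words, PSHUFHW the high four, and
// PSHUFD moves whole word pairs (dwords). Modeling them on plain arrays makes
// the pairing-into-dwords idea easy to check by hand. The names pshuflwModel,
// pshufhwModel, pshufdModel and v8i16SingleInputExample are illustrative
// assumptions, not LLVM APIs, and this is not the routine's exact algorithm.

#include <array>
#include <cassert>
#include <cstdint>

using V8x16 = std::array<uint16_t, 8>;

static V8x16 pshuflwModel(const V8x16 &V, const std::array<int, 4> &Sel) {
  V8x16 R = V;                  // high half is untouched
  for (int i = 0; i != 4; ++i)
    R[i] = V[Sel[i]];           // low words permuted within the low half
  return R;
}

static V8x16 pshufhwModel(const V8x16 &V, const std::array<int, 4> &Sel) {
  V8x16 R = V;                  // low half is untouched
  for (int i = 0; i != 4; ++i)
    R[4 + i] = V[4 + Sel[i]];   // high words permuted within the high half
  return R;
}

static V8x16 pshufdModel(const V8x16 &V, const std::array<int, 4> &Sel) {
  V8x16 R;
  for (int i = 0; i != 4; ++i) { // move whole word pairs (dwords)
    R[2 * i] = V[2 * Sel[i]];
    R[2 * i + 1] = V[2 * Sel[i] + 1];
  }
  return R;
}

static void v8i16SingleInputExample() {
  V8x16 In = {10, 11, 12, 13, 14, 15, 16, 17};
  // Mask {0, 3, 2, 1, 4, 5, 6, 7}: only the low half moves, so one PSHUFLW
  // is enough.
  V8x16 A = pshuflwModel(In, {0, 3, 2, 1});
  assert((A == V8x16{10, 13, 12, 11, 14, 15, 16, 17}));
  // Mask {0, 1, 4, 5, 2, 3, 6, 7}: the words are already paired into dwords,
  // so one PSHUFD moves the pairs into place.
  V8x16 B = pshufdModel(In, {0, 2, 1, 3});
  assert((B == V8x16{10, 11, 14, 15, 12, 13, 16, 17}));
  (void)pshufhwModel; // the high-half analogue, unused in this tiny example
}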
static SDValue lowerV8I16GeneralSingleInputVectorShuffle( const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); MutableArrayRef LoMask = Mask.slice(0, 4); MutableArrayRef HiMask = Mask.slice(4, 4); // Attempt to directly match PSHUFLW or PSHUFHW. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); } if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { for (int i = 0; i != 4; ++i) HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); } SmallVector LoInputs; copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); // If we are shuffling values from one half - check how many different DWORD // pairs we need to create. If only 1 or 2 then we can perform this as a // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. auto ShuffleDWordPairs = [&](ArrayRef PSHUFHalfMask, ArrayRef PSHUFDMask, unsigned ShufWOp) { V = DAG.getNode(ShufWOp, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); V = DAG.getBitcast(PSHUFDVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); return DAG.getBitcast(VT, V); }; if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { int PSHUFDMask[4] = { -1, -1, -1, -1 }; SmallVector, 4> DWordPairs; int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); // Collect the different DWORD pairs. for (int DWord = 0; DWord != 4; ++DWord) { int M0 = Mask[2 * DWord + 0]; int M1 = Mask[2 * DWord + 1]; M0 = (M0 >= 0 ? M0 % 4 : M0); M1 = (M1 >= 0 ? M1 % 4 : M1); if (M0 < 0 && M1 < 0) continue; bool Match = false; for (int j = 0, e = DWordPairs.size(); j < e; ++j) { auto &DWordPair = DWordPairs[j]; if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); DWordPair.second = (M1 >= 0 ? 
M1 : DWordPair.second); PSHUFDMask[DWord] = DOffset + j; Match = true; break; } } if (!Match) { PSHUFDMask[DWord] = DOffset + DWordPairs.size(); DWordPairs.push_back(std::make_pair(M0, M1)); } } if (DWordPairs.size() <= 2) { DWordPairs.resize(2, std::make_pair(-1, -1)); int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, DWordPairs[1].first, DWordPairs[1].second}; if ((NumHToL + NumHToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); if ((NumLToL + NumLToH) == 0) return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); } } // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef AToAInputs, ArrayRef BToAInputs, ArrayRef BToBInputs, ArrayRef AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? 
BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode( FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M >= 0 && M == FixIdx) M = FixFreeIdx; else if (M >= 0 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. 
for (int &M : Mask) if (M >= 0 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M >= 0 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef InPlaceInputs, ArrayRef IncomingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef IncomingInputs, ArrayRef ExistingInputs, MutableArrayRef SourceHalfMask, MutableArrayRef HalfMask, MutableArrayRef FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef SourceHalfMask, int Word) { return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. 
if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] < 0) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] < 0) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. 
SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the /// blend if only one input is used. 
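// A minimal standalone sketch of the shuffle+blend scheme described above:
// one PSHUFB control vector per input, with a control byte of 0x80 zeroing
// the lanes that come from the other input, so an OR of the two results acts
// as the blend. The names pshufbModel, shuffleAsBlendOfPshufbs and
// pshufbBlendExample are illustrative assumptions, not LLVM APIs.

#include <array>
#include <cassert>
#include <cstdint>

using V16x8 = std::array<uint8_t, 16>;

// Scalar model of a 128-bit PSHUFB: a control byte with the high bit set
// produces zero, otherwise its low four bits index the source.
static V16x8 pshufbModel(const V16x8 &Src, const V16x8 &Ctrl) {
  V16x8 R{};
  for (int i = 0; i != 16; ++i)
    R[i] = (Ctrl[i] & 0x80) ? 0 : Src[Ctrl[i] & 0x0F];
  return R;
}

// Two-input byte shuffle (mask entries 0-15 pick from V1, 16-31 from V2,
// -1 is undef) lowered as PSHUFB(V1) | PSHUFB(V2).
static V16x8 shuffleAsBlendOfPshufbs(const V16x8 &V1, const V16x8 &V2,
                                     const std::array<int, 16> &Mask) {
  const uint8_t Zero = 0x80;
  V16x8 C1, C2;
  for (int i = 0; i != 16; ++i) {
    int M = Mask[i];
    C1[i] = (M >= 0 && M < 16) ? uint8_t(M) : Zero;
    C2[i] = (M >= 16) ? uint8_t(M - 16) : Zero;
  }
  V16x8 R1 = pshufbModel(V1, C1), R2 = pshufbModel(V2, C2);
  V16x8 R;
  for (int i = 0; i != 16; ++i)
    R[i] = R1[i] | R2[i]; // the side not selected was zeroed, so OR blends
  return R;
}

static void pshufbBlendExample() {
  V16x8 V1, V2;
  for (int i = 0; i != 16; ++i) {
    V1[i] = uint8_t(i);       // 0..15
    V2[i] = uint8_t(100 + i); // 100..115
  }
  // Interleave the low bytes of the two inputs: {0, 16, 1, 17, ...}.
  std::array<int, 16> Mask;
  for (int i = 0; i != 16; ++i)
    Mask[i] = (i % 2 == 0) ? (i / 2) : (16 + i / 2);
  V16x8 R = shuffleAsBlendOfPshufbs(V1, V2, Mask);
  assert(R[0] == 0 && R[1] == 100 && R[2] == 1 && R[3] == 101);
}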
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { assert(!is128BitLaneCrossingShuffleMask(VT, Mask) && "Lane crossing shuffle masks not supported"); int NumBytes = VT.getSizeInBits() / 8; int Size = Mask.size(); int Scale = NumBytes / Size; SmallVector V1Mask(NumBytes, DAG.getUNDEF(MVT::i8)); SmallVector V2Mask(NumBytes, DAG.getUNDEF(MVT::i8)); V1InUse = false; V2InUse = false; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Scale]; if (M < 0) continue; const int ZeroMask = 0x80; int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask; int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes); if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1), DAG.getBuildVector(ShufVT, DL, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2), DAG.getBuildVector(ShufVT, DL, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; }); if (NumV2Inputs == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG)) return Shift; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. 
if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) return Rotate; // Make a copy of the mask so it can be modified. SmallVector MutableMask(Mask.begin(), Mask.end()); return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1, MutableMask, Subtarget, DAG); } assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) && "All single-input shuffles should be canonicalized to be V1-input " "shuffles."); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // See if we can use SSE4A Extraction / Insertion. if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return V; // There are special ways we can lower some single-element blends. if (NumV2Inputs == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget.hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it // can both shuffle and set up the inefficient blend. if (!IsBlendSupported && Subtarget.hasSSSE3()) { bool V1InUse, V2InUse; return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); } // We can always bit-blend if we have to so the fallback strategy is to // decompose into single-input permutes and blends. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG); } /// Check whether a compaction lowering can be done by dropping even /// elements and compute how many times even elements must be dropped. /// /// This handles shuffles which take every Nth element where N is a power of /// two. Example shuffle masks: /// /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 /// /// Any of these lanes can of course be undef. 
/// /// This routine only supports N <= 3. /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here /// for larger N. /// /// \returns N above, or the number of times even elements must be dropped if /// there is such a number. Otherwise returns zero. static int canLowerByDroppingEvenElements(ArrayRef Mask, bool IsSingleInput) { // The modulus for the shuffle vector entries is based on whether this is // a single input or not. int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); assert(isPowerOf2_32((uint32_t)ShuffleModulus) && "We should only be called with masks with a power-of-2 size!"); uint64_t ModMask = (uint64_t)ShuffleModulus - 1; // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, // and 2^3 simultaneously. This is because we may have ambiguity with // partially undef inputs. bool ViableForN[3] = {true, true, true}; for (int i = 0, e = Mask.size(); i < e; ++i) { // Ignore undef lanes, we'll optimistically collapse them to the pattern we // want. if (Mask[i] < 0) continue; bool IsAnyViable = false; for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) { uint64_t N = j + 1; // The shuffle mask must be equal to (i * 2^N) % M. if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask)) IsAnyViable = true; else ViableForN[j] = false; } // Early exit if we exhaust the possible powers of two. if (!IsAnyViable) break; } for (unsigned j = 0; j != array_lengthof(ViableForN); ++j) if (ViableForN[j]) return j + 1; // Return 0 as there is no viable power of two. return 0; } static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); } /// Generic lowering of v16i8 shuffles. /// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // See if we can use SSE4A Extraction / Insertion. 
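// [Illustrative sketch -- not part of the original source.]
// A self-contained restatement of the stride test that
// canLowerByDroppingEvenElements() (documented above) performs before the
// PACKUS-based compaction attempted later in this function: a mask can be
// lowered by repeatedly dropping even elements with stride 2^N iff every
// defined entry i equals (i << N) modulo the shuffle modulus. Plain std
// types only; assumes <cstdint> and <vector> if compiled on its own.
static int sketchDroppingEvenElementsStride(const std::vector<int> &Mask,
                                            bool IsSingleInput) {
  const uint64_t ModMask =
      (uint64_t)(Mask.size() * (IsSingleInput ? 1 : 2)) - 1;
  bool Viable[3] = {true, true, true}; // candidate strides 2^1, 2^2, 2^3
  for (uint64_t i = 0, e = Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes are compatible with every stride
    for (unsigned j = 0; j != 3; ++j)
      if ((uint64_t)Mask[i] != ((i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0; // no viable power-of-two stride
}
// e.g. {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} with IsSingleInput = true
// yields 1, matching the N = 1 example in the comment above.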
if (Subtarget.hasSSE4A()) if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return V; int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); // For single-input shuffles, there are some nicer lowering tricks we can use. if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef Mask) { for (int i = 0; i < 16; i += 2) if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1]) return false; return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector LoInputs; copy_if(Mask, std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); array_pod_sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector HiInputs; copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; }); array_pod_sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] >= 0) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) { int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 
0 : 8); assert(MappedMask < 8 && "Invalid v8 shuffle mask!"); if (PostDupI16Shuffle[i / 2] < 0) PostDupI16Shuffle[i / 2] = MappedMask; else assert(PostDupI16Shuffle[i / 2] == MappedMask && "Conflicting entries in the original shuffle!"); } return DAG.getBitcast( MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any // blends but after all of the single-input lowerings. If the single input // lowerings can find an instruction sequence that is faster than a PSHUFB, we // want to preserve that and we can DAG combine any longer sequences into // a PSHUFB in the end. But once we start blending from multiple inputs, // the complexity of DAG combining bad patterns back into PSHUFB is too high, // and there are *very* few patterns that would actually be faster than the // PSHUFB approach because of its ability to zero lanes. // // FIXME: The only exceptions to the above are blends which are exact // interleavings with direct instructions supporting them. We currently don't // handle those well here. if (Subtarget.hasSSSE3()) { bool V1InUse = false; bool V2InUse = false; SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs( DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse); // If both V1 and V2 are in use and we can use a direct blend or an unpack, // do so. This avoids using them to handle blends-with-zero which is // important as a single pshufb is significantly faster for that. if (V1InUse && V2InUse) { if (Subtarget.hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // We can use an unpack to do the blending rather than an or in some // cases. Even though the or may be (very minorly) more efficient, we // preference this lowering because there are common cases where part of // the complexity of the shuffles goes away when we do the final blend as // an unpack. // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); // Use PALIGNR+Permute if possible - permute might become PSHUFB but the // PALIGNR will be cheaper than the second PSHUFB+OR. if (SDValue V = lowerVectorShuffleAsByteRotateAndPermute( DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return V; } return PSHUFB; } // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG)) return BitBlend; // Check whether a compaction lowering can be done. 
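// [Illustrative sketch -- not part of the original source.]
// The compaction strategy referenced just above (and implemented generically
// in the code that follows), written out with SSE2 intrinsics for the
// simplest N = 1 case: clear the odd bytes of both inputs, then let PACKUSWB
// squeeze the surviving even bytes of V1 and V2 into one v16i8 result.
// Assumes <emmintrin.h> if compiled on its own.
static __m128i sketchCompactEvenBytes(__m128i V1, __m128i V2) {
  const __m128i ByteClearMask = _mm_set1_epi16(0x00FF); // keep low byte of each i16
  __m128i Lo = _mm_and_si128(V1, ByteClearMask);
  __m128i Hi = _mm_and_si128(V2, ByteClearMask);
  // Every 16-bit lane is now 0x00xx, so the unsigned-saturating pack cannot
  // saturate and simply discards the zeroed high bytes.
  return _mm_packus_epi16(Lo, Hi);
}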
This handles shuffles // which take every Nth element for some even N. See the helper function for // details. // // We special case these as they can be particularly efficiently handled with // the PACKUSB instruction on x86 and they show up in common patterns of // rearranging bytes to truncate wide elements. bool IsSingleInput = V2.isUndef(); if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) { // NumEvenDrops is the power of two stride of the elements. Another way of // thinking about it is that we need to drop the even elements this many // times to get the original input. // First we need to zero all the dropped bytes. assert(NumEvenDrops <= 3 && "No support for dropping even elements more than 3 times."); // We use the mask type to pick which bytes are preserved based on how many // elements are dropped. MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 }; SDValue ByteClearMask = DAG.getBitcast( MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1])); V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask); if (!IsSingleInput) V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask); // Now pack things back together. V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2); SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2); for (int i = 1; i < NumEvenDrops; ++i) { Result = DAG.getBitcast(MVT::v8i16, Result); Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result); } return Result; } // Handle multi-input cases by blending single-input shuffles. if (NumV2Elements > 0) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG); // The fallback path for single-input shuffles widens this into two v8i16 // vectors with unpacks, shuffles those, and then pulls them back together // with a pack. SDValue V = V1; std::array LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; std::array HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}}; for (int i = 0; i < 16; ++i) if (Mask[i] >= 0) (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i]; SDValue VLoHalf, VHiHalf; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) && none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. VLoHalf = DAG.getBitcast(MVT::v8i16, V); VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf, DAG.getConstant(0x00FF, DL, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke VHiHalf. VHiHalf = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into VLoHalf. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into VLoHalf and the high half into // VHiHalf so that we can blend them as i16s. 
SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL); VLoHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); VHiHalf = DAG.getBitcast( MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// Generic routine to split vector shuffle into half-sized shuffles. /// /// This routine just extracts two subvectors, shuffles them independently, and /// then concatenates them back together. This should work effectively with all /// AVX vector shuffle types. static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG) { assert(VT.getSizeInBits() >= 256 && "Only for 256-bit or wider vector shuffles!"); assert(V1.getSimpleValueType() == VT && "Bad operand type!"); assert(V2.getSimpleValueType() == VT && "Bad operand type!"); ArrayRef LoMask = Mask.slice(0, Mask.size() / 2); ArrayRef HiMask = Mask.slice(Mask.size() / 2); int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build // vectors. This helps shuffling with splats and zeros. 
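// [Illustrative sketch -- not part of the original source.]
// The bookkeeping behind the HalfBlend() lambda that follows, reduced to its
// mask arithmetic: for one half of a 2*N-element two-input mask, record which
// of the four half-width subvectors (LoV1, HiV1, LoV2, HiV2) it reads, so the
// caller can tell whether a single half-width shuffle suffices or a blend of
// two shuffles is needed. Assumes <array> and <vector> if compiled on its own.
static std::array<bool, 4> sketchHalvesUsed(const std::vector<int> &HalfMask,
                                            int NumElements) {
  const int Split = NumElements / 2;
  std::array<bool, 4> Used = {false, false, false, false};
  for (int M : HalfMask) {
    if (M < 0)
      continue; // undef reads nothing
    int Input = M >= NumElements ? 1 : 0;  // V1 or V2
    int InIdx = M - Input * NumElements;   // index within that input
    Used[Input * 2 + (InIdx >= Split ? 1 : 0)] = true;
  }
  return Used; // {UseLoV1, UseHiV1, UseLoV2, UseHiV2}
}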
auto SplitVector = [&](SDValue V) { V = peekThroughBitcasts(V); MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; auto *BV = dyn_cast(V); if (!BV) { LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(0, DL)); HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V, DAG.getIntPtrConstant(OrigSplitNumElements, DL)); } else { SmallVector LoOps, HiOps; for (int i = 0; i < OrigSplitNumElements; ++i) { LoOps.push_back(BV->getOperand(i)); HiOps.push_back(BV->getOperand(i + OrigSplitNumElements)); } LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps); HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps); } return std::make_pair(DAG.getBitcast(SplitVT, LoV), DAG.getBitcast(SplitVT, HiV)); }; SDValue LoV1, HiV1, LoV2, HiV2; std::tie(LoV1, HiV1) = SplitVector(V1); std::tie(LoV2, HiV2) = SplitVector(V2); // Now create two 4-way blends of these half-width vectors. auto HalfBlend = [&](ArrayRef HalfMask) { bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false; SmallVector V1BlendMask((unsigned)SplitNumElements, -1); SmallVector V2BlendMask((unsigned)SplitNumElements, -1); SmallVector BlendMask((unsigned)SplitNumElements, -1); for (int i = 0; i < SplitNumElements; ++i) { int M = HalfMask[i]; if (M >= NumElements) { if (M >= NumElements + SplitNumElements) UseHiV2 = true; else UseLoV2 = true; V2BlendMask[i] = M - NumElements; BlendMask[i] = SplitNumElements + i; } else if (M >= 0) { if (M >= SplitNumElements) UseHiV1 = true; else UseLoV1 = true; V1BlendMask[i] = M; BlendMask[i] = i; } } // Because the lowering happens after all combining takes place, we need to // manually combine these blend masks as much as possible so that we create // a minimal number of high-level vector shuffle nodes. // First try just blending the halves of V1 or V2. if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) return DAG.getUNDEF(SplitVT); if (!UseLoV2 && !UseHiV2) return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); if (!UseLoV1 && !UseHiV1) return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); SDValue V1Blend, V2Blend; if (UseLoV1 && UseHiV1) { V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); } else { // We only use half of V1 so map the usage down into the final blend mask. V1Blend = UseLoV1 ? LoV1 : HiV1; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); } if (UseLoV2 && UseHiV2) { V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); } else { // We only use half of V2 so map the usage down into the final blend mask. V2Blend = UseLoV2 ? LoV2 : HiV2; for (int i = 0; i < SplitNumElements; ++i) if (BlendMask[i] >= SplitNumElements) BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); } return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); }; SDValue Lo = HalfBlend(LoMask); SDValue Hi = HalfBlend(HiMask); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } /// Either split a vector in halves or decompose the shuffles and the /// blend. /// /// This is provided as a good fallback for many lowerings of non-single-input /// shuffles with more than one 128-bit lane. 
In those cases, we want to select /// between splitting the shuffle into 128-bit components and stitching those /// back together vs. extracting the single-input shuffles and blending those /// results. static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This routine must not be used to lower single-input " "shuffles as it could then recurse on itself."); int Size = Mask.size(); // If this can be modeled as a broadcast of two elements followed by a blend, // prefer that lowering. This is especially important because broadcasts can // often fold with memory operands. auto DoBothBroadcast = [&] { int V1BroadcastIdx = -1, V2BroadcastIdx = -1; for (int M : Mask) if (M >= Size) { if (V2BroadcastIdx < 0) V2BroadcastIdx = M - Size; else if (M - Size != V2BroadcastIdx) return false; } else if (M >= 0) { if (V1BroadcastIdx < 0) V1BroadcastIdx = M; else if (M != V1BroadcastIdx) return false; } return true; }; if (DoBothBroadcast()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); // If the inputs all stem from a single 128-bit lane of each input, then we // split them rather than blending because the split will decompose to // unusually few instructions. int LaneCount = VT.getSizeInBits() / 128; int LaneSize = Size / LaneCount; SmallBitVector LaneInputs[2]; LaneInputs[0].resize(LaneCount, false); LaneInputs[1].resize(LaneCount, false); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); // Otherwise, just fall back to decomposed shuffles and a blend. This requires // that the decomposed single-input shuffles don't end up here. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, Subtarget, DAG); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a lane permutation followed by a per-lane permutation. /// /// This is mainly for cases where we can have non-repeating permutes /// in each lane. /// /// TODO: This is very similar to lowerVectorShuffleByMerging128BitLanes, /// we should investigate merging them. static SDValue lowerVectorShuffleAsLanePermuteAndPermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumEltsPerLane = NumElts / NumLanes; SmallVector SrcLaneMask(NumLanes, SM_SentinelUndef); SmallVector PermMask(NumElts, SM_SentinelUndef); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M < 0) continue; // Ensure that each lane comes from a single source lane. int SrcLane = M / NumEltsPerLane; int DstLane = i / NumEltsPerLane; if (!isUndefOrEqual(SrcLaneMask[DstLane], SrcLane)) return SDValue(); SrcLaneMask[DstLane] = SrcLane; PermMask[i] = (DstLane * NumEltsPerLane) + (M % NumEltsPerLane); } // Make sure we set all elements of the lane mask, to avoid undef propagation. SmallVector LaneMask(NumElts, SM_SentinelUndef); for (int DstLane = 0; DstLane != NumLanes; ++DstLane) { int SrcLane = SrcLaneMask[DstLane]; if (0 <= SrcLane) for (int j = 0; j != NumEltsPerLane; ++j) { LaneMask[(DstLane * NumEltsPerLane) + j] = (SrcLane * NumEltsPerLane) + j; } } // If we're only shuffling a single lowest lane and the rest are identity // then don't bother. 
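// [Illustrative sketch -- not part of the original source.]
// The decomposition that lowerVectorShuffleAsLanePermuteAndPermute() computes
// just above, restated over plain integers: if every destination lane draws
// from exactly one source lane, the shuffle factors into a whole-lane permute
// (one source lane per destination lane) followed by an in-lane element
// permute. Returns false when some destination lane would need elements from
// two different source lanes. Assumes <vector> if compiled on its own.
static bool sketchLanePermuteAndPermute(const std::vector<int> &Mask,
                                        int NumEltsPerLane,
                                        std::vector<int> &SrcLanePerDstLane,
                                        std::vector<int> &InLanePermMask) {
  const int NumLanes = (int)Mask.size() / NumEltsPerLane;
  SrcLanePerDstLane.assign(NumLanes, -1);
  InLanePermMask.assign(Mask.size(), -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int SrcLane = M / NumEltsPerLane;
    int DstLane = i / NumEltsPerLane;
    if (SrcLanePerDstLane[DstLane] >= 0 && SrcLanePerDstLane[DstLane] != SrcLane)
      return false; // this lane needs two source lanes - not expressible here
    SrcLanePerDstLane[DstLane] = SrcLane;
    InLanePermMask[i] = DstLane * NumEltsPerLane + (M % NumEltsPerLane);
  }
  return true;
}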
// TODO - isShuffleMaskInputInPlace could be extended to something like this. int NumIdentityLanes = 0; bool OnlyShuffleLowestLane = true; for (int i = 0; i != NumLanes; ++i) { if (isSequentialOrUndefInRange(PermMask, i * NumEltsPerLane, NumEltsPerLane, i * NumEltsPerLane)) NumIdentityLanes++; else if (SrcLaneMask[i] != 0 && SrcLaneMask[i] != NumLanes) OnlyShuffleLowestLane = false; } if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) return SDValue(); SDValue LanePermute = DAG.getVectorShuffle(VT, DL, V1, V2, LaneMask); return DAG.getVectorShuffle(VT, DL, LanePermute, DAG.getUNDEF(VT), PermMask); } /// Lower a vector shuffle crossing multiple 128-bit lanes as /// a permutation and blend of those lanes. /// /// This essentially blends the out-of-lane inputs to each lane into the lane /// from a permuted copy of the vector. This lowering strategy results in four /// instructions in the worst case for a single-input cross lane shuffle which /// is lower than any other fully general cross-lane shuffle strategy I'm aware /// of. Special cases for each particular shuffle pattern should be handled /// prior to trying this lowering. static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); int LaneSize = Size / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. if (!Subtarget.hasAVX2()) { bool LaneCrossing[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) LaneCrossing[(Mask[i] % Size) / LaneSize] = true; if (!LaneCrossing[0] || !LaneCrossing[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } else { bool LaneUsed[2] = {false, false}; for (int i = 0; i < Size; ++i) if (Mask[i] >= 0) LaneUsed[(Mask[i] / LaneSize)] = true; if (!LaneUsed[0] || !LaneUsed[1]) return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); SmallVector FlippedBlendMask(Size); for (int i = 0; i < Size; ++i) FlippedBlendMask[i] = Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize) ? Mask[i] : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size); // Flip the vector, and blend the results which should now be in-lane. MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; SDValue Flipped = DAG.getBitcast(PVT, V1); Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), { 2, 3, 0, 1 }); Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } /// Handle lowering 2-lane 128-bit shuffles. static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. if (Subtarget.hasAVX2() && V2.isUndef()) return SDValue(); SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask)) return SDValue(); bool IsLowZero = (Zeroable & 0x3) == 0x3; bool IsHighZero = (Zeroable & 0xc) == 0xc; // Try to use an insert into a zero vector. 
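// [Illustrative sketch -- not part of the original source.]
// How the vperm2f128/vperm2i128 control byte that this routine builds further
// down encodes a 2x128-bit shuffle: bits [1:0] select the 128-bit source of
// the low half (0/1 = halves of V1, 2/3 = halves of V2), bits [5:4] do the
// same for the high half, and bits 3 and 7 force the corresponding half to
// zero. WidenedMask here is the 2-element, 128-bit-granularity mask.
static unsigned sketchVPerm2X128Imm(const int WidenedMask[2], bool IsLowZero,
                                    bool IsHighZero) {
  unsigned Imm = 0;
  Imm |= IsLowZero ? 0x08u : (unsigned)WidenedMask[0];         // bits [1:0] or bit 3
  Imm |= IsHighZero ? 0x80u : ((unsigned)WidenedMask[1] << 4); // bits [5:4] or bit 7
  return Imm;
}
// e.g. WidenedMask = {1, 2} (high half of V1, low half of V2) gives 0x21, the
// classic cross-source permute.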
if (WidenedMask[0] == 0 && IsHighZero) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // TODO: If minimizing size and one of the inputs is a zero vector and the // the zero vector has only one use, we could use a VPERM2X128 to save the // instruction bytes needed to explicitly generate the zero vector. // Blends are faster and handle all the non-lane-crossing cases. if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. if (!isa(peekThroughBitcasts(V1))) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(2, DL)); } } // Try to use SHUF128 if possible. if (Subtarget.hasVLX()) { if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { unsigned PermMask = ((WidenedMask[0] % 2) << 0) | ((WidenedMask[1] % 2) << 1); return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } } } // Otherwise form a 128-bit permutation. After accounting for undefs, // convert the 64-bit shuffle mask selection values into 128-bit // selection bits by dividing the indexes by 2 and shifting into positions // defined by a vperm2*128 instruction's immediate control byte. // The immediate permute control byte looks like this: // [1:0] - select 128 bits from sources for low half of destination // [2] - ignore // [3] - zero low half of destination // [5:4] - select 128 bits from sources for high half of destination // [6] - ignore // [7] - zero high half of destination assert((WidenedMask[0] >= 0 || IsLowZero) && (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); unsigned PermMask = 0; PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); // Check the immediate mask and replace unused sources with undef. if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); } /// Lower a vector shuffle by first fixing the 128-bit lanes and then /// shuffling each lane. /// /// This attempts to create a repeated lane shuffle where each lane uses one /// or two of the lanes of the inputs. The lanes of the input vectors are /// shuffled in one or two independent shuffles to get the lanes into the /// position needed by the final shuffle. /// /// FIXME: This should be generalized to 512-bit shuffles. 
static SDValue lowerVectorShuffleByMerging128BitLanes( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(!V2.isUndef() && "This is only useful with multiple inputs."); if (is128BitLaneRepeatedShuffleMask(VT, Mask)) return SDValue(); int Size = Mask.size(); int LaneSize = 128 / VT.getScalarSizeInBits(); int NumLanes = Size / LaneSize; assert(NumLanes == 2 && "Only handles 256-bit shuffles."); SmallVector RepeatMask(LaneSize, -1); int LaneSrcs[2][2] = { { -1, -1 }, { -1 , -1 } }; // First pass will try to fill in the RepeatMask from lanes that need two // sources. for (int Lane = 0; Lane != NumLanes; ++Lane) { int Srcs[2] = { -1, -1 }; SmallVector InLaneMask(LaneSize, -1); for (int i = 0; i != LaneSize; ++i) { int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; // Determine which of the 4 possible input lanes (2 from each source) // this element comes from. Assign that as one of the sources for this // lane. We can assign up to 2 sources for this lane. If we run out // sources we can't do anything. int LaneSrc = M / LaneSize; int Src; if (Srcs[0] < 0 || Srcs[0] == LaneSrc) Src = 0; else if (Srcs[1] < 0 || Srcs[1] == LaneSrc) Src = 1; else return SDValue(); Srcs[Src] = LaneSrc; InLaneMask[i] = (M % LaneSize) + Src * Size; } // If this lane has two sources, see if it fits with the repeat mask so far. if (Srcs[1] < 0) continue; LaneSrcs[Lane][0] = Srcs[0]; LaneSrcs[Lane][1] = Srcs[1]; auto MatchMasks = [](ArrayRef M1, ArrayRef M2) { assert(M1.size() == M2.size() && "Unexpected mask size"); for (int i = 0, e = M1.size(); i != e; ++i) if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i]) return false; return true; }; auto MergeMasks = [](ArrayRef Mask, MutableArrayRef MergedMask) { assert(Mask.size() == MergedMask.size() && "Unexpected mask size"); for (int i = 0, e = MergedMask.size(); i != e; ++i) { int M = Mask[i]; if (M < 0) continue; assert((MergedMask[i] < 0 || MergedMask[i] == M) && "Unexpected mask element"); MergedMask[i] = M; } }; if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Didn't find a match. Swap the operands and try again. std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]); ShuffleVectorSDNode::commuteMask(InLaneMask); if (MatchMasks(InLaneMask, RepeatMask)) { // Merge this lane mask into the final repeat mask. MergeMasks(InLaneMask, RepeatMask); continue; } // Couldn't find a match with the operands in either order. return SDValue(); } // Now handle any lanes with only one source. for (int Lane = 0; Lane != NumLanes; ++Lane) { // If this lane has already been processed, skip it. if (LaneSrcs[Lane][0] >= 0) continue; for (int i = 0; i != LaneSize; ++i) { int M = Mask[(Lane * LaneSize) + i]; if (M < 0) continue; // If RepeatMask isn't defined yet we can define it ourself. 
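// [Illustrative sketch -- not part of the original source.]
// The compatibility and merge rules used by the MatchMasks/MergeMasks lambdas
// above, as standalone helpers: two partial masks are compatible when they
// agree on every position where both are defined, and merging keeps whichever
// entry is defined. Assumes <vector> if compiled on its own.
static bool sketchMasksCompatible(const std::vector<int> &A,
                                  const std::vector<int> &B) {
  for (size_t i = 0, e = A.size(); i != e; ++i)
    if (A[i] >= 0 && B[i] >= 0 && A[i] != B[i])
      return false;
  return true;
}
static void sketchMergeMask(const std::vector<int> &From,
                            std::vector<int> &Into) {
  for (size_t i = 0, e = From.size(); i != e; ++i)
    if (From[i] >= 0)
      Into[i] = From[i]; // defined entries win; otherwise keep the old value
}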
if (RepeatMask[i] < 0) RepeatMask[i] = M % LaneSize; if (RepeatMask[i] < Size) { if (RepeatMask[i] != M % LaneSize) return SDValue(); LaneSrcs[Lane][0] = M / LaneSize; } else { if (RepeatMask[i] != ((M % LaneSize) + Size)) return SDValue(); LaneSrcs[Lane][1] = M / LaneSize; } } if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0) return SDValue(); } SmallVector NewMask(Size, -1); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][0]; for (int i = 0; i != LaneSize; ++i) { int M = -1; if (Src >= 0) M = Src * LaneSize + i; NewMask[Lane * LaneSize + i] = M; } } SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV1) && cast(NewV1)->getMask() == Mask) return SDValue(); for (int Lane = 0; Lane != NumLanes; ++Lane) { int Src = LaneSrcs[Lane][1]; for (int i = 0; i != LaneSize; ++i) { int M = -1; if (Src >= 0) M = Src * LaneSize + i; NewMask[Lane * LaneSize + i] = M; } } SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); // Ensure we didn't get back the shuffle we started with. // FIXME: This is a hack to make up for some splat handling code in // getVectorShuffle. if (isa(NewV2) && cast(NewV2)->getMask() == Mask) return SDValue(); for (int i = 0; i != Size; ++i) { NewMask[i] = RepeatMask[i % LaneSize]; if (NewMask[i] < 0) continue; NewMask[i] += (i / LaneSize) * LaneSize; } return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); } /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT.is256BitVector() || VT.is512BitVector()) && "Expected 256-bit or 512-bit vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); if (!UndefLower && !UndefUpper) return SDValue(); // Upper half is undef and lower half is whole upper subvector. // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> if (UndefUpper && isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(HalfNumElts, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(0, DL)); } // Lower half is undef and upper half is whole lower subvector. // e.g. vector_shuffle or if (UndefLower && isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, DAG.getIntPtrConstant(HalfNumElts, DL)); } // If the shuffle only uses two of the four halves of the input operands, // then extract them and perform the 'half' shuffle at half width. // e.g. vector_shuffle or int HalfIdx1 = -1, HalfIdx2 = -1; SmallVector HalfMask(HalfNumElts); unsigned Offset = UndefLower ? 
HalfNumElts : 0; for (unsigned i = 0; i != HalfNumElts; ++i) { int M = Mask[i + Offset]; if (M < 0) { HalfMask[i] = M; continue; } // Determine which of the 4 half vectors this element is from. // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. int HalfIdx = M / HalfNumElts; // Determine the element index into its half vector source. int HalfElt = M % HalfNumElts; // We can shuffle with up to 2 half vectors, set the new 'half' // shuffle mask accordingly. if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { HalfMask[i] = HalfElt; HalfIdx1 = HalfIdx; continue; } if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { HalfMask[i] = HalfElt + HalfNumElts; HalfIdx2 = HalfIdx; continue; } // Too many half vectors referenced. return SDValue(); } assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); // Only shuffle the halves of the inputs when useful. int NumLowerHalves = (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); int NumUpperHalves = (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); // uuuuXXXX - don't extract uppers just to insert again. if (UndefLower && NumUpperHalves != 0) return SDValue(); // XXXXuuuu - don't extract both uppers, instead shuffle and then extract. if (UndefUpper && NumUpperHalves == 2) return SDValue(); // AVX2 - XXXXuuuu - always extract lowers. if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) { // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); // AVX2 supports variable 32-bit element cross-lane shuffles. if (VT == MVT::v8f32 || VT == MVT::v8i32) { // XXXXuuuu - don't extract lowers and uppers. if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0) return SDValue(); } } // AVX512 - XXXXuuuu - always extract lowers. if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) return SDValue(); auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); SDValue V = (HalfIdx < 2 ? V1 : V2); HalfIdx = (HalfIdx % 2) * HalfNumElts; return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, DAG.getIntPtrConstant(HalfIdx, DL)); }; SDValue Half1 = GetHalfVector(HalfIdx1); SDValue Half2 = GetHalfVector(HalfIdx2); SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, DAG.getIntPtrConstant(Offset, DL)); } /// Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// /// This returns true if the elements from a particular input are already in the /// slot required by the given mask and require no permutation. static bool isShuffleMaskInputInPlace(int Input, ArrayRef Mask) { assert((Input == 0 || Input == 1) && "Only two inputs to shuffles."); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i) return false; return true; } /// Handle case where shuffle sources are coming from the same 128-bit lane and /// every lane can be represented as the same repeating mask - allowing us to /// shuffle the sources with the repeating shuffle and then permute the result /// to the destination lanes. 
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // On AVX2 we may be able to just shuffle the lowest elements and then // broadcast the result. if (Subtarget.hasAVX2()) { for (unsigned BroadcastSize : {16, 32, 64}) { if (BroadcastSize <= VT.getScalarSizeInBits()) continue; int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); // Attempt to match a repeating pattern every NumBroadcastElts, // accounting for UNDEFs but only references the lowest 128-bit // lane of the inputs. auto FindRepeatingBroadcastMask = [&](SmallVectorImpl &RepeatMask) { for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) { int M = Mask[i + j]; if (M < 0) continue; int &R = RepeatMask[j]; if (0 != ((M % NumElts) / NumLaneElts)) return false; if (0 <= R && R != M) return false; R = M; } return true; }; SmallVector RepeatMask((unsigned)NumElts, -1); if (!FindRepeatingBroadcastMask(RepeatMask)) continue; // Shuffle the (lowest) repeated elements in place for broadcast. SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); // Shuffle the actual broadcast. SmallVector BroadcastMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumBroadcastElts) for (int j = 0; j != NumBroadcastElts; ++j) BroadcastMask[i + j] = j; return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), BroadcastMask); } } // Bail if the shuffle mask doesn't cross 128-bit lanes. if (!is128BitLaneCrossingShuffleMask(VT, Mask)) return SDValue(); // Bail if we already have a repeated lane shuffle mask. SmallVector RepeatedShuffleMask; if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask)) return SDValue(); // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes. int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1; int NumSubLanes = NumLanes * SubLaneScale; int NumSubLaneElts = NumLaneElts / SubLaneScale; // Check that all the sources are coming from the same lane and see if we can // form a repeating shuffle mask (local to each sub-lane). At the same time, // determine the source sub-lane for each destination sub-lane. int TopSrcSubLane = -1; SmallVector Dst2SrcSubLanes((unsigned)NumSubLanes, -1); SmallVector RepeatedSubLaneMasks[2] = { SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef), SmallVector((unsigned)NumSubLaneElts, SM_SentinelUndef)}; for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { // Extract the sub-lane mask, check that it all comes from the same lane // and normalize the mask entries to come from the first lane. int SrcLane = -1; SmallVector SubLaneMask((unsigned)NumSubLaneElts, -1); for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; if (M < 0) continue; int Lane = (M % NumElts) / NumLaneElts; if ((0 <= SrcLane) && (SrcLane != Lane)) return SDValue(); SrcLane = Lane; int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); SubLaneMask[Elt] = LocalM; } // Whole sub-lane is UNDEF. if (SrcLane < 0) continue; // Attempt to match against the candidate repeated sub-lane masks. 
for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { auto MatchMasks = [NumSubLaneElts](ArrayRef M1, ArrayRef M2) { for (int i = 0; i != NumSubLaneElts; ++i) { if (M1[i] < 0 || M2[i] < 0) continue; if (M1[i] != M2[i]) return false; } return true; }; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) continue; // Merge the sub-lane mask into the matching repeated sub-lane mask. for (int i = 0; i != NumSubLaneElts; ++i) { int M = SubLaneMask[i]; if (M < 0) continue; assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && "Unexpected mask element"); RepeatedSubLaneMask[i] = M; } // Track the top most source sub-lane - by setting the remaining to UNDEF // we can greatly simplify shuffle matching. int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); Dst2SrcSubLanes[DstSubLane] = SrcSubLane; break; } // Bail if we failed to find a matching repeated sub-lane mask. if (Dst2SrcSubLanes[DstSubLane] < 0) return SDValue(); } assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && "Unexpected source lane"); // Create a repeating shuffle mask for the entire vector. SmallVector RepeatedMask((unsigned)NumElts, -1); for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { int Lane = SubLane / SubLaneScale; auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { int M = RepeatedSubLaneMask[Elt]; if (M < 0) continue; int Idx = (SubLane * NumSubLaneElts) + Elt; RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector SubLaneMask((unsigned)NumElts, -1); for (int i = 0; i != NumElts; i += NumSubLaneElts) { int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; if (SrcSubLane < 0) continue; for (int j = 0; j != NumSubLaneElts; ++j) SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); } static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, unsigned &ShuffleImm, ArrayRef Mask) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && "Unexpected data type for VSHUFPD"); // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. ShuffleImm = 0; bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { if (Mask[i] == SM_SentinelUndef) continue; if (Mask[i] < 0) return false; int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) CommutableMask = false; ShuffleImm |= (Mask[i] % 2) << i; } if (ShufpdMask) return true; if (CommutableMask) { std::swap(V1, V2); return true; } return false; } static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT, ArrayRef Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&& "Unexpected data type for VSHUFPD"); unsigned Immediate = 0; if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask)) return SDValue(); return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, DAG.getConstant(Immediate, DL, MVT::i8)); } /// Handle lowering of 4-lane 64-bit floating point shuffles. 
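// [Illustrative sketch -- not part of the original source.]
// The immediate that matchVectorShuffleWithSHUFPD() above derives, spelled
// out for the two-element v2f64 case: SHUFPD takes result element 0 from V1
// and result element 1 from V2, and bit i of the immediate picks the upper or
// lower double of that source. Undef handling and the commuted-operand case
// are omitted for brevity.
static bool sketchMatchShufpdV2F64(const int Mask[2], unsigned &Imm) {
  if (Mask[0] < 0 || Mask[0] > 1 || Mask[1] < 2 || Mask[1] > 3)
    return false; // would need swapped operands or a different instruction
  Imm = (unsigned)(Mask[0] % 2) | ((unsigned)(Mask[1] % 2) << 1);
  return true; // e.g. Mask {0, 3} -> Imm 0b10, Mask {1, 2} -> Imm 0b01
}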
/// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (V2.isUndef()) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Use low duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { // Non-half-crossing single input shuffles can be lowered with an // interleaved permutation. unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, DAG.getConstant(VPERMILPMask, DL, MVT::i8)); } // With AVX2 we have direct support for this permutation. if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check if the blend happens to exactly fit that of SHUFPD. if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG)) return Op; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask)))) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // If we have AVX2 then we always want to lower with a blend because an v4 we // can fully permute the elements. 
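// [Illustrative sketch -- not part of the original source.]
// The VPERMILPD immediate built for the non-lane-crossing v4f64 case earlier
// in this function: bit i of the immediate selects the upper (1) or lower (0)
// double within the 128-bit lane that result element i occupies, which is why
// the mask must not cross 128-bit lanes on this path.
static unsigned sketchVPermilPdImmV4F64(const int Mask[4]) {
  return (unsigned)(Mask[0] == 1) | ((unsigned)(Mask[1] == 1) << 1) |
         ((unsigned)(Mask[2] == 3) << 2) | ((unsigned)(Mask[3] == 3) << 3);
}
// e.g. Mask {1, 0, 3, 2} (swap within each 128-bit lane) gives 0b0101.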
if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 4-lane 64-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v4i64 shuffling.. static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on both lanes. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, DAG.getBitcast(MVT::v8i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } // AVX2 provides a direct instruction for permuting a single input across // lanes. return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or VEXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) return V; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, // we will be able to shuffle even across lanes the other input in a single // instruction so skip this pattern. if (!isShuffleMaskInputInPlace(0, Mask) && !isShuffleMaskInputInPlace(1, Mask)) if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. 
return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit floating point shuffles. /// /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 /// isn't available. static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Repeated masks must be half the mask width!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); } // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have a single input shuffle with different shuffle patterns in the // two 128-bit lanes use the variable mask to VPERMILPS. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); if (Subtarget.hasAVX2()) return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return Result; // If we have VLX support, we can use VEXPAND. if (Subtarget.hasVLX()) if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code using vpunpcklwd and // vpunpckhwd instrs than vblend. 
if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32)) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) return V; // If we have AVX2 then we always want to lower with a blend because at v8 we // can fully permute the elements. if (Subtarget.hasAVX2()) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 8-lane 32-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v8i32 shuffling.. static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // For non-AVX512 if the Mask is of 16bit elements in lane then try to split // since after split we get a more efficient code than vblend by using // vpunpcklwd and vpunpckhwd instrs. if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() && !Subtarget.hasAVX512()) if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the two 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // If we have VLX support, we can use VALIGN or EXPAND. if (Subtarget.hasVLX()) { if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; } // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. 
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return V; // If the shuffle patterns aren't repeated but it is a single input, directly // generate a cross-lane VPERMD instruction. if (V2.isUndef()) { SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v8i32, ShufPS); } // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) return Result; // Otherwise fall back on generic blend lowering. return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 16-lane 16-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v16i16 shuffling.. static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return V; if (V2.isUndef()) { // There are no generalized cross-lane shuffle operations available on i16 // element types. 
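    // For instance (illustrative), a single-input v16i16 mask such as
    // <9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6> must be emulated by swapping the
    // 128-bit lanes and then swapping adjacent word pairs within each lane,
    // since AVX2 has no element-granular cross-lane i16 permute.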
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget); } SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v16 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512BWVL can lower to VPERMW. if (Subtarget.hasBWI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG); } /// Handle lowering of 32-lane 8-bit integer shuffles. /// /// This routine is only called when we have AVX2 and thus a reasonable /// instruction set for v32i8 shuffling.. static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Broadcast; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. 
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return V; // There are no generalized cross-lane shuffle operations available on i8 // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget); } if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // AVX512VBMIVL can lower to VPERMB. if (Subtarget.hasVBMI() && Subtarget.hasVLX()) return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. if (SDValue Result = lowerVectorShuffleByMerging128BitLanes( DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) return Result; // Try to permute the lanes and then use a per-lane permute. if (SDValue V = lowerVectorShuffleAsLanePermuteAndPermute( DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; // Otherwise fall back on generic lowering. return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); } /// High-level routine to lower various 256-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 256-bit x86 vector /// shuffle or splits it into two 128-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = VT.getVectorNumElements(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // There is a really nice hard cut-over between AVX1 and AVX2 that means we // can check for those subtargets here and avoid much of the subtarget // querying in the per-vector-type lowering routines. With AVX1 we have // essentially *zero* ability to manipulate a 256-bit vector with integer // types. Since we'll use floating point types there eventually, just // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget.hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) { // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. 
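      // A bit blend covers, e.g. (illustrative), a v16i16 mask like
      // <0,17,2,19,4,21,6,23,8,25,10,27,12,29,14,31>: every element stays in
      // place and only the source (V1 or V2) alternates, so AND/ANDN/OR on the
      // integer values suffices without any AVX2 shuffle.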
if (SDValue V = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) return V; if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), VT.getVectorNumElements()); V1 = DAG.getBitcast(FpVT, V1); V2 = DAG.getBitcast(FpVT, V2); return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v4i64: return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8f32: return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i32: return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i16: return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i8: return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 256-bit x86 vector type!"); } } /// Try to lower a vector shuffle as a 128-bit shuffles. static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(VT.getScalarSizeInBits() == 64 && "Unexpected element type size for 128bit shuffle."); // To handle 256 bit vector requires VLX and most probably // function lowerV2X128VectorShuffle() is better solution. assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? SmallVector WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); // Try to use an insert into a zero vector. if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, getZeroVector(VT, Subtarget, DAG, DL), LoV, DAG.getIntPtrConstant(0, DL)); } // Check for patterns which can be matched with a single insert of a 256-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) { MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, DAG.getIntPtrConstant(4, DL)); } assert(WidenedMask.size() == 4); // See if this is an insertion of the lower 128-bits of V2 into V1. bool IsInsert = true; int V2Index = -1; for (int i = 0; i < 4; ++i) { assert(WidenedMask[i] >= -1); if (WidenedMask[i] < 0) continue; // Make sure all V1 subvectors are in place. if (WidenedMask[i] < 4) { if (WidenedMask[i] != i) { IsInsert = false; break; } } else { // Make sure we only have a single V2 index and its the lowest 128-bits. 
      if (V2Index >= 0 || WidenedMask[i] != 4) {
        IsInsert = false;
        break;
      }
      V2Index = i;
    }
  }
  if (IsInsert && V2Index >= 0) {
    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
                                 DAG.getIntPtrConstant(0, DL));
    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
  }

  // Try to lower to vshuf64x2/vshuf32x4.
  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
  unsigned PermMask = 0;
  // Ensure elements came from the same Op.
  for (int i = 0; i < 4; ++i) {
    assert(WidenedMask[i] >= -1);
    if (WidenedMask[i] < 0)
      continue;

    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
    unsigned OpIndex = i / 2;
    if (Ops[OpIndex].isUndef())
      Ops[OpIndex] = Op;
    else if (Ops[OpIndex] != Op)
      return SDValue();

    // Convert the 128-bit shuffle mask selection values into 128-bit selection
    // bits defined by a vshuf64x2 instruction's immediate control byte.
    PermMask |= (WidenedMask[i] % 4) << (i * 2);
  }

  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

/// Handle lowering of 8-lane 64-bit floating point shuffles.
static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  if (V2.isUndef()) {
    // Use low duplicate instructions for masks that match their pattern.
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
      // Non-half-crossing single input shuffles can be lowered with an
      // interleaved permutation.
      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
    }

    SmallVector<int, 4> RepeatedMask;
    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
  }

  if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask,
                                                 Zeroable, V1, V2, Subtarget,
                                                 DAG))
    return Shuf128;

  if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2,
                                                  DAG))
    return Unpck;

  // Check if the blend happens to exactly fit that of SHUFPD.
  if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2,
                                                DAG))
    return Op;

  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask,
                                             V1, V2, DAG, Subtarget))
    return V;

  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
}

/// Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // If the shuffle mask is repeated in each 128-bit lane, we have many more // options to efficiently lower the shuffle. SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); // Use even/odd duplicate instructions for masks that match their pattern. if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); if (V2.isUndef()) return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) return Unpck; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } // If we have a single input shuffle with different shuffle patterns in the // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. if (V2.isUndef() && !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); } // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four // 128-bit lanes. SmallVector Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector PSHUFDMask; scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, DAG.getBitcast(MVT::v16i32, V1), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } SmallVector Repeated256Mask; if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } if (SDValue Shuf128 = lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable, V1, V2, Subtarget, DAG)) return Shuf128; // Try to use shift instructions. 
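  // For example (illustrative), a v8i64 mask <Z,0,Z,2,Z,4,Z,6>, where Z marks
  // a zeroable element, moves each even quadword into the odd slot of its
  // 128-bit lane and is equivalent to shifting every lane left by 8 bytes.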
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use PALIGNR. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) return Unpck; // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // If the shuffle mask is repeated in each 128-bit lane we can use more // efficient instructions that mirror the shuffles across the four 128-bit // lanes. SmallVector RepeatedMask; bool Is128BitLaneRepeatedShuffle = is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); if (Is128BitLaneRepeatedShuffle) { assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); if (V2.isUndef()) return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) return V; } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use VALIGN. if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Try to use byte rotation instructions. if (Subtarget.hasBWI()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // Assume that a single SHUFPS is faster than using a permv shuffle. // If some CPU is harmed by the domain switch, we can fix it in a later pass. if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, CastV1, CastV2, DAG); return DAG.getBitcast(MVT::v16i32, ShufPS); } // If we have AVX512F support, we can use VEXPAND. 
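  // VEXPAND places consecutive low elements of the source at the positions the
  // mask keeps and zeroes the rest; e.g. (illustrative) a v16i32 mask
  // <Z,0,1,Z,Z,2,3,Z,Z,4,5,Z,Z,6,7,Z> (Z = zeroable) matches VPEXPANDD with a
  // writemask selecting the eight non-zero positions.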
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, DAG, Subtarget)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (V2.isUndef()) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. return lowerV8I16GeneralSingleInputVectorShuffle( DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return ZExt; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) return V; // Use dedicated pack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, Subtarget)) return V; // Try to use shift instructions. 
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Shift; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) return PSHUFB; // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Blend; // FIXME: Implement direct support for this type! return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG); } /// High-level routine to lower various 512-bit x86 vector shuffles. /// /// This routine either breaks down the specific type of a 512-bit x86 vector /// shuffle or splits it into two 256-bit shuffles and fuses the results back /// together based on the available instructions. static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "Cannot lower 512-bit vectors w/ basic ISA!"); // If we have a single input to the zero element, insert that into V1 if we // can do so cheaply. int NumElts = Mask.size(); int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); if (NumV2Elements == 1 && Mask[0] >= NumElts) if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; // Handle special cases where the lower or upper half is UNDEF. if (SDValue V = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) return V; // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that // the requisite ISA extensions for that element type are available. switch (VT.SimpleTy) { case MVT::v8f64: return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16f32: return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i64: return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i32: return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v32i16: return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v64i8: return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); default: llvm_unreachable("Not a valid 512-bit x86 vector type!"); } } // Determine if this shuffle can be implemented with a KSHIFT instruction. // Returns the shift amount if possible or -1 if not. This is a simplified // version of matchVectorShuffleAsShift. 
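// For example (illustrative), a v16i1 mask <Z,Z,0,1,...,13> with the two low
// elements zeroable matches KSHIFTL with a shift amount of 2, while the
// mirrored pattern <2,3,...,15,Z,Z> matches KSHIFTR with the same amount.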
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
                                    int MaskOffset, const APInt &Zeroable) {
  int Size = Mask.size();

  auto CheckZeros = [&](int Shift, bool Left) {
    for (int j = 0; j < Shift; ++j)
      if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
        return false;

    return true;
  };

  auto MatchShift = [&](int Shift, bool Left) {
    unsigned Pos = Left ? Shift : 0;
    unsigned Low = Left ? 0 : Shift;
    unsigned Len = Size - Shift;
    return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
  };

  for (int Shift = 1; Shift != Size; ++Shift)
    for (bool Left : {true, false})
      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
        return Shift;
      }

  return -1;
}

// Lower vXi1 vector shuffles.
// There is no dedicated instruction on AVX-512 that shuffles the masks.
// The only way to shuffle the bits is to sign-extend the mask vector to a SIMD
// vector, shuffle, and then truncate it back.
static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                      MVT VT, SDValue V1, SDValue V2,
                                      const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "Cannot lower 512-bit vectors w/o basic ISA!");

  unsigned NumElts = Mask.size();

  // Try to recognize shuffles that are just padding a subvector with zeros.
  unsigned SubvecElts = 0;
  for (int i = 0; i != (int)NumElts; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i)
      break;

    ++SubvecElts;
  }
  assert(SubvecElts != NumElts && "Identity shuffle?");

  // Clip to a power of 2.
  SubvecElts = PowerOf2Floor(SubvecElts);

  // Make sure the number of zeroable bits in the top at least covers the bits
  // not covered by the subvector.
  if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
    MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
                                  V1, DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
                       getZeroVector(VT, Subtarget, DAG, DL),
                       Extract, DAG.getIntPtrConstant(0, DL));
  }

  // Try to match KSHIFTs.
  // TODO: Support narrower than legal shifts by widening and extracting.
  if (NumElts >= 16 || (Subtarget.hasDQI() && NumElts == 8)) {
    unsigned Offset = 0;
    for (SDValue V : { V1, V2 }) {
      unsigned Opcode;
      int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
      if (ShiftAmt >= 0)
        return DAG.getNode(Opcode, DL, VT, V,
                           DAG.getConstant(ShiftAmt, DL, MVT::i8));
      Offset += NumElts; // Increment for next iteration.
    }
  }

  MVT ExtVT;
  switch (VT.SimpleTy) {
  default:
    llvm_unreachable("Expected a vector of i1 elements");
  case MVT::v2i1:
    ExtVT = MVT::v2i64;
    break;
  case MVT::v4i1:
    ExtVT = MVT::v4i32;
    break;
  case MVT::v8i1:
    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
    // shuffle.
    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
    break;
  case MVT::v16i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
    break;
  case MVT::v32i1:
    // Take 512-bit type, unless we are avoiding 512-bit types and have the
    // 256-bit operation available.
    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
    ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
    break;
  case MVT::v64i1:
    ExtVT = MVT::v64i8;
    break;
  }

  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // i1 was sign extended; we can use X86ISD::CVT2MASK.
int NumElems = VT.getVectorNumElements(); if ((Subtarget.hasBWI() && (NumElems >= 32)) || (Subtarget.hasDQI() && (NumElems < 32))) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), Shuffle, ISD::SETGT); return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); } /// Helper function that returns true if the shuffle mask should be /// commuted to improve canonicalization. static bool canonicalizeShuffleMaskWithCommute(ArrayRef Mask) { int NumElements = Mask.size(); int NumV1Elements = 0, NumV2Elements = 0; for (int M : Mask) if (M < 0) continue; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return true; assert(NumV1Elements > 0 && "No V1 indices"); if (NumV2Elements == 0) return false; // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. When that is tied, // ensure that the sum of indices for V1 is equal to or lower than the sum // indices for V2. When those are equal, try to ensure that the number of odd // indices for V1 is lower than the number of odd indices for V2. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : Mask.slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return true; if (LowV2Elements == LowV1Elements) { int SumV1Indices = 0, SumV2Indices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) SumV2Indices += i; else if (Mask[i] >= 0) SumV1Indices += i; if (SumV2Indices < SumV1Indices) return true; if (SumV2Indices == SumV1Indices) { int NumV1OddIndices = 0, NumV2OddIndices = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= NumElements) NumV2OddIndices += i % 2; else if (Mask[i] >= 0) NumV1OddIndices += i % 2; if (NumV2OddIndices < NumV1OddIndices) return true; } } } return false; } /// Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast(Op); ArrayRef Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc DL(Op); bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); assert((VT.getSizeInBits() != 64 || Is1BitVector) && "Can't lower MMX shuffles"); bool V1IsUndef = V1.isUndef(); bool V2IsUndef = V2.isUndef(); if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return DAG.getCommutedVectorShuffle(*SVOp); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. 
This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); } // Check for illegal shuffle mask element index values. int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit; assert(llvm::all_of(Mask, [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && "Out of bounds shuffle index"); // We actually see shuffles that are entirely re-arrangements of a set of // zero inputs. This mostly happens while decomposing complex shuffles into // simple ones. Directly lower these as a buildvector of zeros. APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2); if (Zeroable.isAllOnesValue()) return getZeroVector(VT, Subtarget, DAG, DL); bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); // Create an alternative mask with info about zeroable elements. // Here we do not set undef elements as zeroable. SmallVector ZeroableMask(Mask.begin(), Mask.end()); if (V2IsZero) { assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!"); for (int i = 0; i != NumElements; ++i) if (Mask[i] != SM_SentinelUndef && Zeroable[i]) ZeroableMask[i] = SM_SentinelZero; } // Try to collapse shuffles into using a vector type with fewer elements but // wider element types. We cap this to not form integers or floating point // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(ZeroableMask, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. // TODO: Avoid lowering directly from this top-level function: make this // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) return Broadcast; MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); int NewNumElts = NumElements / 2; MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts); // Make sure that the new vector type is legal. For example, v2f64 isn't // legal on SSE1. if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { if (V2IsZero) { // Modify the new Mask to take all zeros from the all-zero vector. // Choose indices that are blend-friendly. bool UsedZeroVector = false; assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() && "V2's non-undef elements are used?!"); for (int i = 0; i != NewNumElts; ++i) if (WidenedMask[i] == SM_SentinelZero) { WidenedMask[i] = i + NewNumElts; UsedZeroVector = true; } // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits // some elements to be undef. if (UsedZeroVector) V2 = getZeroVector(NewVT, Subtarget, DAG, DL); } V1 = DAG.getBitcast(NewVT, V1); V2 = DAG.getBitcast(NewVT, V2); return DAG.getBitcast( VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); } } // Commute the shuffle if it will improve canonicalization. 
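  // For example (illustrative), the v4i32 mask <4,5,6,1> takes three elements
  // from V2 and only one from V1, so the operands are swapped and the mask is
  // rewritten as <0,1,2,5>; the lowering then only has to reason about how
  // many elements come from V1.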
if (canonicalizeShuffleMaskWithCommute(Mask)) return DAG.getCommutedVectorShuffle(*SVOp); if (SDValue V = lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget)) return V; // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is256BitVector()) return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (VT.is512BitVector()) return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); if (Is1BitVector) return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } /// Try to lower a VSELECT instruction to a vector shuffle. static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); // Only non-legal VSELECTs reach this lowering, convert those into generic // shuffles and re-use the shuffle lowering path for blends. SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); return SDValue(); } SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); // A vselect where all conditions and data are constants can be optimized into // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) return SDValue(); // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; // If this VSELECT has a vector if i1 as a mask, it will be directly matched // with patterns on the mask registers on AVX-512. MVT CondVT = Cond.getSimpleValueType(); unsigned CondEltSize = Cond.getScalarValueSizeInBits(); if (CondEltSize == 1) return Op; // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); unsigned EltSize = VT.getScalarSizeInBits(); unsigned NumElts = VT.getVectorNumElements(); // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition // into an i1 condition so that we can use the mask-based 512-bit blend // instructions. if (VT.getSizeInBits() == 512) { // Build a mask by testing the condition against zero. MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, DAG.getConstant(0, dl, CondVT), ISD::SETNE); // Now return a new VSELECT using the mask. return DAG.getSelect(dl, VT, Mask, LHS, RHS); } // SEXT/TRUNC cases where the mask doesn't match the destination size. if (CondEltSize != EltSize) { // If we don't have a sign splat, rely on the expansion. if (CondEltSize != DAG.ComputeNumSignBits(Cond)) return SDValue(); MVT NewCondSVT = MVT::getIntegerVT(EltSize); MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts); Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT); return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS); } // Only some types will be legal on some subtargets. 
If we can emit a legal // VSELECT-matching blend, return Op, and but if we need to expand, return // a null value. switch (VT.SimpleTy) { default: // Most of the vector types have blends past SSE4.1. return Op; case MVT::v32i8: // The byte blends for AVX vectors were introduced only in AVX2. if (Subtarget.hasAVX2()) return Op; return SDValue(); case MVT::v8i16: case MVT::v16i16: { // Bitcast everything to the vXi8 type and use a vXi8 vselect. MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2); Cond = DAG.getBitcast(CastVT, Cond); LHS = DAG.getBitcast(CastVT, LHS); RHS = DAG.getBitcast(CastVT, RHS); SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS); return DAG.getBitcast(VT, Select); } } } static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (!Op.getOperand(0).getSimpleValueType().is128BitVector()) return SDValue(); if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { // EXTRACTPS outputs to a GPR32 register which will require a movd to copy // the result back to FR32 register. It's only worth matching if the // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. if (!Op.hasOneUse()) return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op.getOperand(0)), Op.getOperand(1)); return DAG.getBitcast(MVT::f32, Extract); } if (VT == MVT::i32 || VT == MVT::i64) { // ExtractPS/pextrq works with constant index. if (isa(Op.getOperand(1))) return Op; } return SDValue(); } /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); SDValue Idx = Op.getOperand(1); MVT EltVT = Op.getSimpleValueType(); assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && "Unexpected vector type in ExtractBitFromMaskVector"); // variable index can't be handled in mask registers, // extend vector to VR512/128 if (!isa(Idx)) { unsigned NumElts = VecVT.getVectorNumElements(); // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; // Extend to natively supported kshift. unsigned NumElems = VecVT.getVectorNumElements(); MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? 
                 MVT::v8i1 : MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
                      DAG.getUNDEF(WideVecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Use kshiftr instruction to move to the lower element.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
                    DAG.getConstant(IdxVal, dl, MVT::i8));
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG, Subtarget);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimate
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example : extractelement <16 x i8> %a, i32 %i
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0 - DV  |  5  |  6  |  7   |    |
    // ---------------------------------------------
    // |   1    |          | 1.0 |     |      | CP | vmovd xmm1, edi
    // |   1    |          | 1.0 |     |      | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0      | 1.0 |     |      | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 - D   | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) return Res; // TODO: We only extract a single element from v16i8, we can probably afford // to be more aggressive here before using the default approach of spilling to // stack. if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { // Extract either the lowest i32 or any i16, and extract the sub-byte. int DWordIdx = IdxVal / 4; if (DWordIdx == 0) { SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), DAG.getIntPtrConstant(DWordIdx, dl)); int ShiftVal = (IdxVal % 4) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } int WordIdx = IdxVal / 2; SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, DAG.getBitcast(MVT::v8i16, Vec), DAG.getIntPtrConstant(WordIdx, dl)); int ShiftVal = (IdxVal % 2) * 8; if (ShiftVal != 0) Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, DAG.getConstant(ShiftVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } if (VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; // SHUFPS the element to the lowest double word, then movss. int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() == 64) { // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. if (IdxVal == 0) return Op; // UNPCKHPD the element to the lowest double word, then movsd. // Note if the lower 64 bits of the result of the UNPCKHPD is then stored // to a f64mem, the whole operation is folded into a single MOVHPDmr. int Mask[2] = { 1, -1 }; Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); } return SDValue(); } /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); SDValue Idx = Op.getOperand(2); MVT VecVT = Vec.getSimpleValueType(); if (!isa(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. unsigned NumElts = VecVT.getVectorNumElements(); MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); } // Copy into a k-register, extract to v1i1 and insert_subvector. 
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Op.getOperand(2)); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2 = Op.getOperand(2); if (!isa(N2)) return SDValue(); auto *N2C = cast(N2); unsigned IdxVal = N2C->getZExtValue(); bool IsZeroElt = X86::isZeroNode(N1); bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); // If we are inserting a element, see if we can do this more efficiently with // a blend shuffle with a rematerializable vector than a costly integer // insertion. if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && 16 <= EltVT.getSizeInBits()) { SmallVector BlendMask; for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } // If the vector is wider than 128 bits, extract the 128-bit subvector, insert // into that, and then insert the subvector back into the result. if (VT.is256BitVector() || VT.is512BitVector()) { // With a 256-bit vector, we can insert into the zero element efficiently // using a blend if we have AVX or AVX2 and the right data type. if (VT.is256BitVector() && IdxVal == 0) { // TODO: It is worthwhile to cast integer to floating point and back // and incur a domain crossing penalty if that's what we'll end up // doing anyway after extracting to a 128-bit vector. if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || (Subtarget.hasAVX2() && EltVT == MVT::i32)) { SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); N2 = DAG.getIntPtrConstant(1, dl); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2); } } // Get the desired 128-bit vector chunk. SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); assert(isPowerOf2_32(NumEltsIn128)); // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); // Insert the changed part back into the bigger vector return insert128BitVector(N0, V, IdxVal, DAG, dl); } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); // Transform it so it match pinsr{b,w} which expects a GR32 as its second // argument. SSE41 required for pinsrb. if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) { unsigned Opc; if (VT == MVT::v8i16) { assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW"); Opc = X86ISD::PINSRW; } else { assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector"); assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB"); Opc = X86ISD::PINSRB; } if (N1.getValueType() != MVT::i32) N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); if (N2.getValueType() != MVT::i32) N2 = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(Opc, dl, VT, N0, N1, N2); } if (Subtarget.hasSSE41()) { if (EltVT == MVT::f32) { // Bits [7:6] of the constant are the source select. 
This will always be // zero here. The DAG Combiner may combine an extract_elt index into // these bits. For example (insert (extract, 3), 2) could be matched by // putting the '3' into bits [7:6] of X86ISD::INSERTPS. // Bits [5:4] of the constant are the destination select. This is the // value of the incoming immediate. // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather // than an insertps. Blends are simpler operations in hardware and so // will always have equal or better performance than insertps. // But if optimizing for size and there's a load folding opportunity, // generate insertps because blendps does not have a 32-bit memory // operand form. N2 = DAG.getIntPtrConstant(1, dl); N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2); } N2 = DAG.getIntPtrConstant(IdxVal << 4, dl); // Create this as a scalar to vector.. N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); } // PINSR* works with constant index. if (EltVT == MVT::i32 || EltVT == MVT::i64) return Op; } return SDValue(); } static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT OpVT = Op.getSimpleValueType(); // It's always cheaper to replace a xor+movd with xorps and simplifies further // combines. if (X86::isZeroNode(Op.getOperand(0))) return getZeroVector(OpVT, Subtarget, DAG, dl); // If this is a 256-bit vector result, first insert into a 128-bit // vector and then insert into the 256-bit vector. if (!OpVT.is128BitVector()) { // Insert into a 128-bit vector. unsigned SizeFactor = OpVT.getSizeInBits() / 128; MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(), OpVT.getVectorNumElements() / SizeFactor); Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0)); // Insert the 128-bit vector. return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl); } assert(OpVT.is128BitVector() && "Expected an SSE type!"); // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. if (OpVT == MVT::v4i32) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); return DAG.getBitcast( OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. 
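// Illustrative sketch (standalone encoder, an assumption of this note rather
// than an LLVM helper): the INSERTPS immediate layout described in the
// comment above. Bits [7:6] select the source lane, bits [5:4] the
// destination lane, bits [3:0] are the zero mask; the "IdxVal << 4" used by
// the lowering is the destination-only case.
#include <cstdint>

static uint8_t encodeInsertPSImm(unsigned SrcLane, unsigned DstLane,
                                 unsigned ZeroMask) {
  return static_cast<uint8_t>(((SrcLane & 0x3u) << 6) |
                              ((DstLane & 0x3u) << 4) |
                              (ZeroMask & 0xFu));
}
// encodeInsertPSImm(0, IdxVal, 0) == (IdxVal << 4) for IdxVal in [0, 3].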
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1); return insert1BitVector(Op, DAG, Subtarget); } static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Only vXi1 extract_subvectors need custom lowering"); SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Idx = Op.getOperand(1); if (!isa(Idx)) return SDValue(); unsigned IdxVal = cast(Idx)->getZExtValue(); if (IdxVal == 0) // the operation is legal return Op; MVT VecVT = Vec.getSimpleValueType(); unsigned NumElems = VecVT.getVectorNumElements(); // Extend to natively supported kshift. MVT WideVecVT = VecVT; if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, DAG.getUNDEF(WideVecVT), Vec, DAG.getIntPtrConstant(0, dl)); } // Shift to the LSB. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec, DAG.getIntPtrConstant(0, dl)); } // Returns the appropriate wrapper opcode for a global reference. unsigned X86TargetLowering::getGlobalWrapperKind( const GlobalValue *GV, const unsigned char OpFlags) const { // References to absolute symbols are never PC-relative. if (GV && GV->isAbsoluteSymbolRef()) return X86ISD::Wrapper; CodeModel::Model M = getTargetMachine().getCodeModel(); if (Subtarget.isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) return X86ISD::WrapperRIP; // GOTPCREL references must always use RIP. if (OpFlags == X86II::MO_GOTPCREL) return X86ISD::WrapperRIP; return X86ISD::Wrapper; } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOV32ri. SDValue X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag); SDLoc DL(CP); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { JumpTableSDNode *JT = cast(Op); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag); SDLoc DL(JT); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. 
if (OpFlag) Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); return Result; } SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { const char *Sym = cast(Op)->getSymbol(); // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag); SDLoc DL(Op); Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (OpFlag) { Result = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result); } // For symbols that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { // Create the TargetBlockAddressAddress node. unsigned char OpFlags = Subtarget.classifyBlockAddressReference(); const BlockAddress *BA = cast(Op)->getBlockAddress(); int64_t Offset = cast(Op)->getOffset(); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags); Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } return Result; } SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, const SDLoc &dl, int64_t Offset, SelectionDAG &DAG) const { // Create the TargetGlobalAddress node, folding in the constant // offset if it is legal. unsigned char OpFlags = Subtarget.classifyGlobalReference(GV); CodeModel::Model M = DAG.getTarget().getCodeModel(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; if (OpFlags == X86II::MO_NO_FLAG && X86::isOffsetSuitableForCodeModel(Offset, M)) { // A direct static reference to a global. Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); Offset = 0; } else { Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags); } Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result); // With PIC, the address is actually $g + Offset. if (isGlobalRelativeToPICBase(OpFlags)) { Result = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result); } // For globals that require a load from a stub to get the address, emit the // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. 
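// Illustrative sketch (hypothetical, heavily simplified): the address pattern
// shared by the constant-pool, jump-table, symbol and block-address lowerings
// above. A wrapped target offset may get the PIC base register added
// ("$g + Offset"), and stub references take one extra load through the GOT.
// loadGOTSlot is a stand-in for the DAG load node.
#include <cstdint>

static uint64_t loadGOTSlot(uint64_t Slot) { return Slot; /* placeholder */ }

static uint64_t formPICAddress(uint64_t WrappedOffset, uint64_t PICBase,
                               bool AddPICBase, bool IsStubReference) {
  uint64_t Addr = WrappedOffset;
  if (AddPICBase)
    Addr += PICBase;              // the "$g + Offset" adjustment
  if (IsStubReference)
    Addr = loadGOTSlot(Addr);     // indirect through the stub's GOT entry
  return Addr;
}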
if (Offset != 0) Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, DAG.getConstant(Offset, dl, PtrVT)); return Result; } SDValue X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalValue *GV = cast(Op)->getGlobal(); int64_t Offset = cast(Op)->getOffset(); return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG); } static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic = false) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SDLoc dl(GA); SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR; if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } else { SDValue Ops[] = { Chain, TGA }; Chain = DAG.getNode(CallType, dl, NodeTys, Ops); } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. MFI.setAdjustsStack(true); MFI.setHasCalls(true); SDValue Flag = Chain.getValue(1); return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { SDValue InFlag; SDLoc dl(GA); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT) { return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSGD); } static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool is64Bit) { SDLoc dl(GA); // Get the start address of the TLS block for this module. X86MachineFunctionInfo *MFI = DAG.getMachineFunction() .getInfo(); MFI->incNumLocalDynamicTLSAccesses(); SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX, X86II::MO_TLSLD, /*LocalDynamic=*/true); } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSLDM, /*LocalDynamic=*/true); } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations // of Base. // Build x@dtpoff. unsigned char OperandFlags = X86II::MO_DTPOFF; unsigned WrapperKind = X86ISD::Wrapper; SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); // Add x@dtpoff with the base. return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base); } // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. 
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC) { SDLoc dl(GA); // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), is64Bit ? 257 : 256)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); unsigned char OperandFlags = 0; // Most TLS accesses are not RIP relative, even on x86-64. One exception is // initialexec. unsigned WrapperKind = X86ISD::Wrapper; if (model == TLSModel::LocalExec) { OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; } else if (model == TLSModel::InitialExec) { if (is64Bit) { OperandFlags = X86II::MO_GOTTPOFF; WrapperKind = X86ISD::WrapperRIP; } else { OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF; } } else { llvm_unreachable("Unexpected model"); } // emit "addl x@ntpoff,%eax" (local exec) // or "addl x@indntpoff,%eax" (initial exec) // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic) SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), OperandFlags); SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); if (model == TLSModel::InitialExec) { if (isPIC && !is64Bit) { Offset = DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, MachinePointerInfo::getGOT(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool PositionIndependent = isPositionIndependent(); if (Subtarget.isTargetELF()) { TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: if (Subtarget.is64Bit()) return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT); return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT); case TLSModel::LocalDynamic: return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(), PositionIndependent); } llvm_unreachable("Unknown TLS model."); } if (Subtarget.isTargetDarwin()) { // Darwin only has one model of TLS. Lower to that. unsigned char OpFlag = 0; unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ? X86ISD::WrapperRIP : X86ISD::Wrapper; // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. bool PIC32 = PositionIndependent && !Subtarget.is64Bit(); if (PIC32) OpFlag = X86II::MO_TLVP_PIC_BASE; else OpFlag = X86II::MO_TLVP; SDLoc DL(Op); SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, GA->getValueType(0), GA->getOffset(), OpFlag); SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result); // With PIC32, the address is actually $g + Offset. if (PIC32) Offset = DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Offset); // Lowering the machine isd will make sure everything is in the right // location. 
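// Illustrative sketch (hypothetical helpers, not LLVM APIs): the address
// arithmetic of the initial-exec / local-exec lowering above. The thread
// pointer comes from the segment base (%fs:0 on 64-bit, %gs:0 on 32-bit);
// local-exec adds a link-time constant offset, while initial-exec loads the
// offset from the GOT first (adding the PIC base on 32-bit PIC code).
#include <cstdint>

static uint64_t readThreadPointer() { return 0; /* models the %fs:0 load */ }
static uint64_t loadGOTOffset(uint64_t Slot) { return Slot; /* @gottpoff load */ }

static uint64_t tlsExecAddress(bool IsLocalExec, uint64_t ConstOffset,
                               uint64_t GOTSlot, uint64_t PICBase,
                               bool IsPIC32) {
  uint64_t Offset;
  if (IsLocalExec) {
    Offset = ConstOffset;                   // x@tpoff / x@ntpoff immediate
  } else {                                  // initial-exec
    uint64_t Slot = GOTSlot;
    if (IsPIC32)
      Slot += PICBase;                      // x@gotntpoff(%ebx)
    Offset = loadGOTOffset(Slot);
  }
  return readThreadPointer() + Offset;      // thread pointer + variable offset
}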
SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); SDValue Args[] = { Chain, Offset }; Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true), DAG.getIntPtrConstant(0, DL, true), Chain.getValue(1), DL); // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // And our return value (tls address) is in the standard call return value // location. unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1)); } if (Subtarget.isTargetKnownWindowsMSVC() || Subtarget.isTargetWindowsItanium() || Subtarget.isTargetWindowsGNU()) { // Just use the implicit TLS architecture // Need to generate something similar to: // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage // ; from TEB // mov ecx, dword [rel _tls_index]: Load index (from C runtime) // mov rcx, qword [rdx+rcx*8] // mov eax, .tls$:tlsvar // [rax+rcx] contains the address // Windows 64bit: gs:0x58 // Windows 32bit: fs:__tls_array SDLoc dl(GA); SDValue Chain = DAG.getEntryNode(); // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly // use its literal value of 0x2C. Value *Ptr = Constant::getNullValue(Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256) : Type::getInt32PtrTy(*DAG.getContext(), 257)); SDValue TlsArray = Subtarget.is64Bit() ? DAG.getIntPtrConstant(0x58, dl) : (Subtarget.isTargetWindowsGNU() ? DAG.getIntPtrConstant(0x2C, dl) : DAG.getExternalSymbol("_tls_array", PtrVT)); SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr)); SDValue res; if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) { res = ThreadPointer; } else { // Load the _tls_index variable SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT); if (Subtarget.is64Bit()) IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX, MachinePointerInfo(), MVT::i32); else IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo()); auto &DL = DAG.getDataLayout(); SDValue Scale = DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8); IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale); res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX); } res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo()); // Get the offset of start of .tls section SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0), GA->getOffset(), X86II::MO_SECREL); SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset); } llvm_unreachable("TLS not implemented for this target."); } /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. /// TODO: Can this be moved to general expansion code? 
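// Illustrative sketch (simulated with plain pointers, hypothetical names):
// the pointer chasing emitted for Windows implicit TLS above.
// ThreadLocalStoragePtr models the TEB slot (gs:0x58 on Win64,
// fs:__tls_array / fs:0x2C on 32-bit), TlsIndex models _tls_index, and
// SecRelOffset is the variable's @SECREL offset inside the .tls section.
#include <cstdint>

static char *windowsTlsAddress(char **ThreadLocalStoragePtr,
                               uint32_t TlsIndex,
                               uint32_t SecRelOffset) {
  // mov rcx, qword [rdx + rcx*8]  -- select this module's TLS block
  char *TlsBlock = ThreadLocalStoragePtr[TlsIndex];
  // [rax + rcx] -- add the variable's offset within the block
  return TlsBlock + SecRelOffset;
}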
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); MVT VT = Op.getSimpleValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); // ISD::FSHL and ISD::FSHR have defined overflow behavior but ISD::SHL and // ISD::SRA/L nodes haven't. Insert an AND to be safe, it's optimized away // during isel. SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits - 1, dl, MVT::i8)); SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i8)) : DAG.getConstant(0, dl, VT); SDValue Tmp2, Tmp3; if (Op.getOpcode() == ISD::SHL_PARTS) { Tmp2 = DAG.getNode(ISD::FSHL, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt); } else { Tmp2 = DAG.getNode(ISD::FSHR, dl, VT, ShOpHi, ShOpLo, ShAmt); Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt); } // If the shift amount is larger or equal than the width of a part we can't // rely on the results of shld/shrd. Insert a test and select the appropriate // values for large shift amounts. SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, DAG.getConstant(VTBits, dl, MVT::i8)); SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode, DAG.getConstant(0, dl, MVT::i8), ISD::SETNE); SDValue Hi, Lo; if (Op.getOpcode() == ISD::SHL_PARTS) { Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } else { Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2); Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3); } return DAG.getMergeValues({ Lo, Hi }, dl); } static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && "Unexpected funnel shift opcode!"); SDLoc DL(Op); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Amt = Op.getOperand(2); bool IsFSHR = Op.getOpcode() == ISD::FSHR; if (VT.isVector()) { assert(Subtarget.hasVBMI2() && "Expected VBMI2"); if (IsFSHR) std::swap(Op0, Op1); APInt APIntShiftAmt; if (isConstantSplat(Amt, APIntShiftAmt)) { uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); return DAG.getNode(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, Op0, Op1, DAG.getConstant(ShiftAmt, DL, MVT::i8)); } return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, Op0, Op1, Amt); } assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && "Unexpected funnel shift type!"); // Expand slow SHLD/SHRD cases if we are not optimizing for size. bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (IsFSHR) std::swap(Op0, Op1); // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. if (VT == MVT::i16) Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, DAG.getConstant(15, DL, Amt.getValueType())); unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD); return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt); } // Try to use a packed vector operation to handle i64 on 32-bit targets when // AVX512DQ is enabled. 
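// Illustrative sketch (plain C++ over two i32 halves, not DAG nodes): the
// value LowerShiftParts computes for SHL_PARTS. Tmp2 models the funnel shift
// (shld), Tmp3 the plain shift of the low half, and the final select handles
// amounts of 32..63, where a single shld/shl pair is not enough.
#include <cstdint>

static void shlParts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                     uint32_t &OutLo, uint32_t &OutHi) {
  unsigned Safe = Amt & 31;                                  // the AND with VTBits-1
  uint32_t Tmp2 = Safe ? (Hi << Safe) | (Lo >> (32 - Safe))  // ISD::FSHL
                       : Hi;
  uint32_t Tmp3 = Lo << Safe;                                // ISD::SHL
  if (Amt & 32) {      // shift amount >= width of one part
    OutHi = Tmp3;      // low half shifted entirely into the high half
    OutLo = 0;         // Tmp1 is the zero constant for SHL_PARTS
  } else {
    OutHi = Tmp2;
    OutLo = Tmp3;
  }
}
// shlParts(0x80000001u, 0, 1) gives OutLo == 2, OutHi == 1, i.e.
// 0x0000000080000001 << 1 == 0x0000000100000002.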
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getOpcode() == ISD::SINT_TO_FP || Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!"); SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() || (VT != MVT::f32 && VT != MVT::f64)) return SDValue(); // Pack the i64 into a vector, do the operation and extract. // Using 256-bit to ensure result is 128-bits for f32 case. unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecVT = MVT::getVectorVT(VT, NumElts); SDLoc dl(Op); SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src); SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec, DAG.getIntPtrConstant(0, dl)); } SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); if (SrcVT.isVector()) { if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { return DAG.getNode(X86ISD::CVTSI2P, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(SrcVT))); } return SDValue(); } assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 && "Unknown SINT_TO_FP to lower!"); // These are really Legal; return the operand so the caller accepts it as // Legal. if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT)) return Op; if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) return Op; if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; SDValue ValueToStore = Op.getOperand(0); if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); unsigned Size = SrcVT.getSizeInBits()/8; MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); SDValue Chain = DAG.getStore( DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const { // Build the FILD SDLoc DL(Op); SDVTList Tys; bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); if (useSSE) Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue); else Tys = DAG.getVTList(Op.getValueType(), MVT::Other); unsigned ByteSize = SrcVT.getSizeInBits()/8; FrameIndexSDNode *FI = dyn_cast(StackSlot); MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); } SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; SDValue Result = DAG.getMemIntrinsicNode(useSSE ? 
X86ISD::FILD_FLAG : X86ISD::FILD, DL, Tys, Ops, SrcVT, MMO); if (useSSE) { Chain = Result.getValue(1); SDValue InFlag = Result.getValue(2); // FIXME: Currently the FST is glued to the FILD_FLAG. This // shouldn't be necessary except that RFP cannot be live across // multiple blocks. When stackifier is fixed, they can be uncoupled. MachineFunction &MF = DAG.getMachineFunction(); unsigned SSFISize = Op.getValueSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false); auto PtrVT = getPointerTy(MF.getDataLayout()); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); Tys = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); Result = DAG.getLoad( Op.getValueType(), DL, Chain, StackSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); } return Result; } /// 64-bit unsigned integer to double expansion. static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This algorithm is not obvious. Here it is what we're trying to output: /* movq %rax, %xmm0 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } #ifdef __SSE3__ haddpd %xmm0, %xmm0 #else pshufd $0x4e, %xmm0, %xmm1 addpd %xmm1, %xmm0 #endif */ SDLoc dl(Op); LLVMContext *Context = DAG.getContext(); // Build some magic constants. static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; Constant *C0 = ConstantDataVector::get(*Context, CV0); auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16); SmallVector CV1; CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)))); CV1.push_back( ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), APInt(64, 0x4530000000000000ULL)))); Constant *C1 = ConstantVector::get(CV1); SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16); // Load the 64-bit value into an XMM register. SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), /* Alignment = */ 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; if (Subtarget.hasSSE3()) { // FIXME: The 'haddpd' instruction may be slower than 'shuffle + addsd'. Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); } else { SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, DAG.getIntPtrConstant(0, dl)); } /// 32-bit unsigned integer to float expansion. 
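// Illustrative sketch (scalar model of LowerUINT_TO_FP_i64 above): the
// 0x43300000 / 0x45300000 punpckldq + subpd trick. Pairing the low 32 bits
// with exponent word 0x43300000 produces the double 2^52 + Lo, pairing the
// high 32 bits with 0x45300000 produces 2^84 + Hi * 2^32; subtracting 2^52
// and 2^84 and summing the halves (the haddpd / shuffle+addpd step)
// reconstructs the unsigned value with a single rounding.
#include <cstdint>
#include <cstring>

static double u64ToDouble(uint64_t X) {
  uint32_t Lo = static_cast<uint32_t>(X);
  uint32_t Hi = static_cast<uint32_t>(X >> 32);

  uint64_t LoBits = (uint64_t{0x43300000u} << 32) | Lo;   // bits of 2^52 + Lo
  uint64_t HiBits = (uint64_t{0x45300000u} << 32) | Hi;   // bits of 2^84 + Hi*2^32

  double LoD, HiD;
  std::memcpy(&LoD, &LoBits, sizeof(double));
  std::memcpy(&HiD, &HiBits, sizeof(double));

  const double C52 = 4503599627370496.0;                  // 0x1.0p52
  const double C84 = C52 * 4294967296.0;                  // 0x1.0p52 * 0x1.0p32
  return (HiD - C84) + (LoD - C52);                       // both differences exact
}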
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(Op); // FP constant to bias correct the final result. SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::f64); // Load the 32-bit value into an XMM register. SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(0)); // Zero out the upper parts of the register. Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Load), DAG.getIntPtrConstant(0, dl)); // Or the load with the bias. SDValue Or = DAG.getNode( ISD::OR, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)), DAG.getBitcast(MVT::v2i64, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType()); } static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (Op.getSimpleValueType() != MVT::v2f64) return SDValue(); SDValue N0 = Op.getOperand(0); assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type"); // Legalize to v4i32 type. N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); if (Subtarget.hasAVX512()) return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT, // but using v2i32 to v2f64 with X86ISD::CVTSI2P. SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32); SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32); // Two to the power of half-word-size. SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64); // Clear upper part of LO, lower HI. SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord); SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask); SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI); fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW); SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO); // Add the two halves. return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO); } static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // The algorithm is the following: // #ifdef __SSE4_1__ // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); // #else // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; // uint4 hi = (v >> 16) | (uint4) 0x53000000; // #endif // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; // We shouldn't use it when unsafe-fp-math is enabled though: we might later // reassociate the two FADDs, and if we do that, the algorithm fails // spectacularly (PR24512). // FIXME: If we ever have some kind of Machine FMF, this should be marked // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because // there's also the MachineCombiner reassociations happening on Machine IR. 
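// Illustrative sketch (scalar form of LowerUINT_TO_FP_i32 above): OR the
// 32-bit value into the mantissa of 0x4330000000000000 (the double 2^52),
// reinterpret, and subtract the bias. 2^52 + V is exactly representable, so
// the subtraction recovers V without rounding.
#include <cstdint>
#include <cstring>

static double u32ToDouble(uint32_t V) {
  uint64_t Bits = 0x4330000000000000ULL | V;   // "or the load with the bias"
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 4503599627370496.0;               // subtract 2^52 -> exactly V
}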
if (DAG.getTarget().Options.UnsafeFPMath) return SDValue(); SDLoc DL(Op); SDValue V = Op->getOperand(0); MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) && "Unsupported custom type"); // In the #idef/#else code, we have in common: // - The vector of constants: // -- 0x4b000000 // -- 0x53000000 // - A shift: // -- v >> 16 // Create the splat vector for 0x4b000000. SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); // Create the splat vector for 0x53000000. SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); // Create the right shift. SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); SDValue Low, High; if (Subtarget.hasSSE41()) { MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); // Low will be bitcasted right away, so do not bother bitcasting back to its // original type. Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), // (uint4) 0x53000000, 0xaa); SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); // High will be bitcasted right away, so do not bother bitcasting back to // its original type. High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32)); } else { SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); // uint4 hi = (v >> 16) | (uint4) 0x53000000; High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); } // Create the vector constant for -(0x1.0p39f + 0x1.0p23f). SDValue VecCstFAdd = DAG.getConstantFP( APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT); // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); // TODO: Are there any fast-math-flags to propagate here? 
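// Illustrative sketch (scalar form of the SSE algorithm quoted in the big
// comment above): splice the value's 16-bit halves into the mantissas of
// 2^23 (0x4b000000) and 2^39 (0x53000000), fold the constants back out, and
// add. Every intermediate is exact; only the final add rounds, which is why
// the lowering refuses to run under unsafe-fp-math (reassociation would
// break the cancellation).
#include <cstdint>
#include <cstring>

static float u32ToFloat(uint32_t V) {
  uint32_t LoBits = (V & 0xffffu) | 0x4b000000u;   // bits of 2^23 + (V & 0xffff)
  uint32_t HiBits = (V >> 16)     | 0x53000000u;   // bits of 2^39 + (V >> 16) * 2^16
  float LoF, HiF;
  std::memcpy(&LoF, &LoBits, sizeof(float));
  std::memcpy(&HiF, &HiBits, sizeof(float));
  const float Cst = 549755813888.0f + 8388608.0f;  // 0x1.0p39f + 0x1.0p23f, exact
  float FHi = HiF - Cst;                           // exact: (V >> 16)*2^16 - 2^23
  return LoF + FHi;                                // single rounding step
}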
SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); } static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = Op.getOperand(0); MVT SrcVT = N0.getSimpleValueType(); SDLoc dl(Op); switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); } } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { // Conversions from unsigned i32 to f32/f64 are legal, // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. return Op; } if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) return V; if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); if (SrcVT == MVT::i32 && X86ScalarSSEf64) return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32) return SDValue(); // Make a 64-bit buffer, and use it to build an FILD. SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); if (SrcVT == MVT::i32) { SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl); SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, MachinePointerInfo()); SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), OffsetSlot, MachinePointerInfo()); SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); return Fild; } assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); SDValue ValueToStore = Op.getOperand(0); if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from // an SSE register, avoiding the store forwarding penalty that would come // with two 32-bit stores. ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot, MachinePointerInfo()); // For i64 source, we need to add the appropriate power of 2 if the input // was negative. This is the same as the optimization in // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast(StackSlot)->getIndex(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MMO); APInt FF(32, 0x5F800000ULL); // Check whether the sign bit is set. 
SDValue SignSet = DAG.getSetCC( dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. SDValue FudgePtr = DAG.getConstantPool( ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT); // Get a pointer to FF if the sign bit was set, or to 0 otherwise. SDValue Zero = DAG.getIntPtrConstant(0, dl); SDValue Four = DAG.getIntPtrConstant(4, dl); SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four); FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? SDValue Fudge = DAG.getExtLoad( ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, /* Alignment = */ 4); // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), // just return an pair. // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 // to i16, i32 or i64, and we lower it to a legal sequence. // If lowered to the final integer result we return a pair. // Otherwise we lower it to a sequence ending with a FIST, return a // pair, and the caller is responsible for loading // the final integer result from StackSlot. std::pair X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { // f16 must be promoted before using the lowering in this routine. // fp128 does not use this lowering. return std::make_pair(SDValue(), SDValue()); } // If using FIST to compute an unsigned i64, we'll need some fixup // to handle values above the maximum signed i64. A FIST is always // used for the 32-bit subtarget, but also for f80 on a 64-bit target. bool UnsignedFixup = !IsSigned && DstTy == MVT::i64 && (!Subtarget.is64Bit() || !isScalarFPTypeInSSEReg(TheVT)); if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) { // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } assert(DstTy.getSimpleVT() <= MVT::i64 && DstTy.getSimpleVT() >= MVT::i16 && "Unknown FP_TO_INT to lower!"); // These are really Legal. if (DstTy == MVT::i32 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); if (Subtarget.is64Bit() && DstTy == MVT::i64 && isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); // We lower FP->int64 into FISTP64 followed by a load from a temporary // stack slot. 
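// Illustrative sketch (scalar model of the tail of LowerUINT_TO_FP above):
// the i64 is converted as a signed value (the FILD), and when its sign bit
// was set the constant 0x5F800000 -- the float 2^64 -- is added back in x87
// extended precision before rounding to the destination type. long double
// stands in for the f80 temporary here; two's-complement narrowing is assumed.
#include <cstdint>

static double u64ViaSignedFILD(uint64_t X) {
  long double Fild = static_cast<long double>(static_cast<int64_t>(X)); // FILD
  long double Fudge = (X >> 63) ? 18446744073709551616.0L   // 2^64 if negative
                                : 0.0L;
  return static_cast<double>(Fild + Fudge);                  // FADD + FP_ROUND
}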
MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; switch (DstTy.getSimpleVT().SimpleTy) { default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. if (UnsignedFixup) { // // Conversion to unsigned i64 is implemented with a select, // depending on whether the source value fits in the range // of a signed i64. Let Thresh be the FP equivalent of // 0x8000000000000000ULL. // // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); // Fist-to-mem64 FistSrc // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent // to XOR'ing the high 32 bits with Adjust. // // Being a power of 2, Thresh is exactly representable in all FP formats. // For X87 we'd like to use the smallest FP type for this constant, but // for DAG type consistency we have to match the FP operand type. APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; bool LosesInfo = false; if (TheVT == MVT::f64) // The rounding mode is irrelevant as the conversion should be exact. Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &LosesInfo); else if (TheVT == MVT::f80) Status = Thresh.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, &LosesInfo); assert(Status == APFloat::opOK && !LosesInfo && "FP conversion should have been exact"); SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); SDValue Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Adjust = DAG.getSelect(DL, MVT::i32, Cmp, DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0x80000000, DL, MVT::i32)); SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), TheVT), Value, ThreshVal, ISD::SETLT); Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); } // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, MachinePointerInfo::getFixedStack(MF, SSFI)); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false); StackSlot = DAG.getFrameIndex(SSFI, PtrVT); } MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, MemSize, MemSize); if (UnsignedFixup) { // Insert the FIST, load its result as two i32's, // and XOR the high i32 with Adjust. 
SDValue FistOps[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), FistOps, DstTy, MMO); SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo()); SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL); SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo()); High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); if (Subtarget.is64Bit()) { // Join High32 and Low32 into a 64-bit result. // (High32 << 32) | Low32 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, DAG.getConstant(32, DL, MVT::i8)); SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); return std::make_pair(Result, SDValue()); } SDValue ResultOps[] = { Low32, High32 }; SDValue pair = IsReplace ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) : DAG.getMergeValues(ResultOps, DL); return std::make_pair(pair, SDValue()); } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); } } static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); assert(VT.isVector() && InVT.isVector() && "Expected vector type"); assert(VT.getVectorNumElements() == VT.getVectorNumElements() && "Expected same number of elements"); assert((VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::i64) && "Unexpected element type"); assert((InVT.getVectorElementType() == MVT::i8 || InVT.getVectorElementType() == MVT::i16 || InVT.getVectorElementType() == MVT::i32) && "Unexpected element type"); // Custom legalize v8i8->v8i64 on CPUs without avx512bw. if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input. return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode: // // v8i16 -> v8i32 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32. // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. // Concat upper and lower parts. // // v4i32 -> v4i64 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64. // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. // Concat upper and lower parts. // MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue OpLo = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, HalfVT, In); SDValue ZeroVec = DAG.getConstant(0, dl, InVT); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); OpHi = DAG.getBitcast(HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } // Helper to split and extend a v16i1 mask to v16i8 or v16i16. 
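// Illustrative sketch (scalar model of the UnsignedFixup path above): values
// at or above 2^63 are moved into signed range before the FIST and the sign
// bit is restored afterwards by XOR-ing the high half with 0x80000000.
// fistTruncate stands in for the FP_TO_INT64_IN_MEM node.
#include <cstdint>

static int64_t fistTruncate(double V) { return static_cast<int64_t>(V); }

static uint64_t fpToU64(double Value) {
  const double Thresh = 9223372036854775808.0;            // 2^63, exactly representable
  uint32_t Adjust    = (Value < Thresh) ? 0u : 0x80000000u;
  double   FistSrc   = (Value < Thresh) ? Value : Value - Thresh;
  uint64_t Result    = static_cast<uint64_t>(fistTruncate(FistSrc));
  return Result ^ (static_cast<uint64_t>(Adjust) << 32);  // flip bit 63 if adjusted
}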
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG) { assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."); SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(0, dl)); SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, DAG.getIntPtrConstant(8, dl)); Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo); Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi); return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); // For all vectors, but vXi8 we can just emit a sign_extend and a shift. This // avoids a constant pool load. if (VT.getVectorElementType() != MVT::i8) { SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In); return DAG.getNode(ISD::SRL, DL, VT, Extend, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); } // Extend VT if BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI()) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, DL)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue One = DAG.getConstant(1, DL, WideVT); SDValue Zero = DAG.getConstant(0, DL, WideVT); SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); // Truncate if we had to extend above. if (VT != ExtVT) { WideVT = MVT::getVectorVT(MVT::i8, NumElts); SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, DAG.getIntPtrConstant(0, DL)); return SelectedVal; } static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); if (SVT.getVectorElementType() == MVT::i1) return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. /// It makes use of the fact that vectors with enough leading sign/zero bits /// prevent the PACKSS/PACKUS from saturating the results. /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && "Unexpected PACK opcode"); assert(DstVT.isVector() && "VT not a vector?"); // Requires SSE2 but AVX512 has fast vector truncate. 
if (!Subtarget.hasSSE2()) return SDValue(); EVT SrcVT = In.getValueType(); // No truncation required, we might get here due to recursive calls. if (SrcVT == DstVT) return In; // We only support vector truncation to 64bits or greater from a // 128bits or greater source. unsigned DstSizeInBits = DstVT.getSizeInBits(); unsigned SrcSizeInBits = SrcVT.getSizeInBits(); if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0) return SDValue(); unsigned NumElems = SrcVT.getVectorNumElements(); if (!isPowerOf2_32(NumElems)) return SDValue(); LLVMContext &Ctx = *DAG.getContext(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Pack to the largest type possible: // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. EVT InVT = MVT::i16, OutVT = MVT::i8; if (SrcVT.getScalarSizeInBits() > 16 && (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { InVT = MVT::i32; OutVT = MVT::i16; } // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector. if (SrcVT.is128BitVector()) { InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); In = DAG.getBitcast(InVT, In); SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In); Res = extractSubVector(Res, 0, DAG, DL, 64); return DAG.getBitcast(DstVT, Res); } // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); unsigned SubSizeInBits = SrcSizeInBits / 2; InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { Lo = DAG.getBitcast(InVT, Lo); Hi = DAG.getBitcast(InVT, Hi); SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). Res = DAG.getBitcast(MVT::v4i64, Res); Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); if (DstVT.is256BitVector()) return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. 
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; if (InVT.getScalarSizeInBits() <= 16) { if (Subtarget.hasBWI()) { // legal, will go to VPMOVB2M, VPMOVW2M if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. // Shift packed bytes not supported natively, bitcast to word MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); In = DAG.getNode(ISD::SHL, DL, ExtVT, DAG.getBitcast(ExtVT, In), DAG.getConstant(ShiftInx, DL, ExtVT)); In = DAG.getBitcast(InVT, In); } return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); } // Use TESTD/Q, extended vector to packed dword/qword. assert((InVT.is256BitVector() || InVT.is128BitVector()) && "Unexpected vector type."); unsigned NumElts = InVT.getVectorNumElements(); assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements"); // We need to change to a wider element type that we have support for. // For 8 element vectors this is easy, we either extend to v8i32 or v8i64. // For 16 element vectors we extend to v16i32 unless we are explicitly // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors // we need to split into two 8 element vectors which we can extend to v8i32, // truncate and concat the results. There's an additional complication if // the original type is v16i8. In that case we can't split the v16i8 so // first we pre-extend it to v16i16 which we can split to v8i16, then extend // to v8i32, truncate that to v8i1 and concat the two halves. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { if (InVT == MVT::v16i8) { // First we need to sign extend up to 256-bits so we can split that. InVT = MVT::v16i16; In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In); } SDValue Lo = extract128BitVector(In, 0, DAG, DL); SDValue Hi = extract128BitVector(In, 8, DAG, DL); // We're split now, just emit two truncates and a concat. The two // truncates will trigger legalization to come back to this function. Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo); Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } // We either have 8 elements or we're allowed to use 512-bit vectors. // If we have VLX, we want to use the narrowest vector that can get the // job done so we use vXi32. MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); InVT = ExtVT; ShiftInx = InVT.getScalarSizeInBits() - 1; } if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { // We need to shift to get the lsb into sign position. 
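// Illustrative sketch (per-element model, not the DAG code): PACKSS/PACKUS
// saturate while narrowing, so truncateVectorWithPACK above only behaves like
// a plain truncate when the input already has enough leading sign bits
// (PACKSS) or leading zero bits (PACKUS) -- the preconditions its callers
// check with ComputeNumSignBits / computeKnownBits.
#include <algorithm>
#include <cstdint>

static int8_t packssElt(int16_t V) {                 // one PACKSSWB lane
  return static_cast<int8_t>(std::min<int16_t>(127, std::max<int16_t>(-128, V)));
}
static uint8_t packusElt(int16_t V) {                // one PACKUSWB lane
  return static_cast<uint8_t>(std::min<int16_t>(255, std::max<int16_t>(0, V)));
}
// packssElt(-5) == -5 and packusElt(200) == 200 (acts as a truncate when the
// value already fits), but packssElt(300) == 127 saturates -- hence the
// sign/zero-bit checks before using PACK for truncation.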
In = DAG.getNode(ISD::SHL, DL, InVT, In, DAG.getConstant(ShiftInx, DL, InVT)); } // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m. if (Subtarget.hasDQI()) return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE); } SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); unsigned InNumEltBits = InVT.getScalarSizeInBits(); assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); // If called by the legalizer just return. if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); if (VT.getVectorElementType() == MVT::i1) return LowerTruncateVecI1(Op, DAG, Subtarget); // vpmovqb/w/d, vpmovdb/w, vpmovwb if (Subtarget.hasAVX512()) { // word to byte only under BWI. Otherwise we have to promoted to v16i32 // and then truncate that. But we should only do that if we haven't been // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be // handled by isel patterns. if (InVT != MVT::v16i16 || Subtarget.hasBWI() || Subtarget.canExtendTo512DQ()) return Op; } unsigned NumPackedSignBits = std::min(VT.getScalarSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; // Truncate with PACKUS if we are truncating a vector with leading zero bits // that extend all the way to the packed/truncated value. // Pre-SSE41 we can only use PACKUSWB. KnownBits Known = DAG.computeKnownBits(In); if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; // Truncate with PACKSS if we are truncating a vector with sign-bits that // extend all the way to the packed/truncated value. if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) if (SDValue V = truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) return V; if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget.hasInt256()) { static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; In = DAG.getBitcast(MVT::v8i32, In); In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, DAG.getIntPtrConstant(0, DL)); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(2, DL)); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); static const int ShufMask[] = {0, 2, 4, 6}; return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask); } if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { // On AVX2, v8i32 -> v8i16 becomes PSHUFB. 
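    // PSHUFB shuffles bytes only within each 128-bit lane, so the AVX2 mask
    // below keeps bytes {0,1},{4,5},{8,9},{12,13} of every lane (the low half
    // of each i32) and a cross-lane v4i64 shuffle then glues the two lane
    // results together before the low 128 bits are extracted. Without AVX2 the
    // same selection is done with two 128-bit PSHUFBs and an interleaving
    // shuffle.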
if (Subtarget.hasInt256()) { In = DAG.getBitcast(MVT::v32i8, In); // The PSHUFB mask: static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 16, 17, 20, 21, 24, 25, 28, 29, -1, -1, -1, -1, -1, -1, -1, -1 }; In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); In = DAG.getBitcast(MVT::v4i64, In); static const int ShufMask2[] = {0, 2, -1, -1}; In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, DAG.getIntPtrConstant(0, DL)); return DAG.getBitcast(VT, In); } SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(0, DL)); SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, DAG.getIntPtrConstant(4, DL)); OpLo = DAG.getBitcast(MVT::v16i8, OpLo); OpHi = DAG.getBitcast(MVT::v16i8, OpHi); // The PSHUFB mask: static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1}; OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1); OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1); OpLo = DAG.getBitcast(MVT::v4i32, OpLo); OpHi = DAG.getBitcast(MVT::v4i32, OpHi); // The MOVLHPS Mask: static const int ShufMask2[] = {0, 1, 4, 5}; SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); return DAG.getBitcast(MVT::v8i16, res); } if (VT == MVT::v16i8 && InVT == MVT::v16i16) { // Use an AND to zero uppper bits for PACKUS. In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT)); SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(0, DL)); SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, DAG.getIntPtrConstant(8, DL)); return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); } // Handle truncation of V256 to V128 using shuffles. assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); assert(Subtarget.hasAVX() && "256-bit vector without AVX!"); unsigned NumElems = VT.getVectorNumElements(); MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2); SmallVector MaskVec(NumElems * 2, -1); // Prepare truncation shuffle mask for (unsigned i = 0; i != NumElems; ++i) MaskVec[i] = i * 2; In = DAG.getBitcast(NVT, In); SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, DAG.getIntPtrConstant(0, DL)); } SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT; MVT VT = Op.getSimpleValueType(); if (VT.isVector()) { SDValue Src = Op.getOperand(0); SDLoc dl(Op); if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. ResVT = MVT::v8i32; TruncVT = MVT::v8i1; Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); } SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) { return DAG.getNode(IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32))); } return SDValue(); } assert(!VT.isVector()); std::pair Vals = FP_TO_INTHelper(Op, DAG, IsSigned, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. if (!FIST.getNode()) return Op; if (StackSlot.getNode()) // Load the result. return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo()); // The node is the result. return FIST; } static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT))); } /// Horizontal vector math instructions may be slower than normal math with /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch /// implementation, and likely shuffle complexity of the alternate sequence. static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget) { bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize(); bool HasFastHOps = Subtarget.hasFastHorizontalOps(); return !IsSingleSource || IsOptimizingSize || HasFastHOps; } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // If both operands have other uses, this is probably not profitable. SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (!LHS.hasOneUse() && !RHS.hasOneUse()) return Op; // FP horizontal add/sub were added with SSE3. Integer with SSSE3. bool IsFP = Op.getSimpleValueType().isFloatingPoint(); if (IsFP && !Subtarget.hasSSE3()) return Op; if (!IsFP && !Subtarget.hasSSSE3()) return Op; // Defer forming the minimal horizontal op if the vector source has more than // the 2 extract element uses that we're matching here. In that case, we might // form a horizontal op that includes more than 1 add/sub op. if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getOperand(0) != RHS.getOperand(0) || !LHS.getOperand(0)->hasNUsesOfValue(2, 0)) return Op; if (!isa(LHS.getOperand(1)) || !isa(RHS.getOperand(1)) || !shouldUseHorizontalOp(true, DAG, Subtarget)) return Op; // Allow commuted 'hadd' ops. // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? unsigned HOpcode; switch (Op.getOpcode()) { case ISD::ADD: HOpcode = X86ISD::HADD; break; case ISD::SUB: HOpcode = X86ISD::HSUB; break; case ISD::FADD: HOpcode = X86ISD::FHADD; break; case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; default: llvm_unreachable("Trying to lower unsupported opcode to horizontal op"); } unsigned LExtIndex = LHS.getConstantOperandVal(1); unsigned RExtIndex = RHS.getConstantOperandVal(1); if (LExtIndex == 1 && RExtIndex == 0 && (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) std::swap(LExtIndex, RExtIndex); // TODO: This can be extended to handle other adjacent extract pairs. 
if (LExtIndex != 0 || RExtIndex != 1) return Op; SDValue X = LHS.getOperand(0); EVT VecVT = X.getValueType(); unsigned BitWidth = VecVT.getSizeInBits(); assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && "Not expecting illegal vector widths here"); // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit // equivalent, so extract the 256/512-bit source op to 128-bit. // This is free: ymm/zmm -> xmm. SDLoc DL(Op); if (BitWidth == 256 || BitWidth == 512) X = extract128BitVector(X, 0, DAG, DL); // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, DAG.getIntPtrConstant(0, DL)); } /// Depending on uarch and/or optimizing for size, we might prefer to use a /// vector operation in place of the typical scalar operation. static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && "Only expecting float/double"); return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); } /// The only differences between FABS and FNEG are the mask and the logic op. /// FNEG also has a folding opportunity for FNEG(FABS(x)). static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && "Wrong opcode for lowering FABS or FNEG."); bool IsFABS = (Op.getOpcode() == ISD::FABS); // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) for (SDNode *User : Op->uses()) if (User->getOpcode() == ISD::FNEG) return Op; SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. // Using a 16-byte mask allows folding the load of the mask with // the logic op, so it can save (~4 bytes) on code size. bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); unsigned LogicOp = IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? 
Op0.getOperand(0) : Op0; if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, // and extract the scalar result back out. Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDValue Mag = Op.getOperand(0); SDValue Sign = Op.getOperand(1); SDLoc dl(Op); // If the sign operand is smaller, extend it first. MVT VT = Op.getSimpleValueType(); if (Sign.getSimpleValueType().bitsLT(VT)) Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); // And if it is bigger, shrink it first. if (Sign.getSimpleValueType().bitsGT(VT)) Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl)); // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); // Perform all scalar logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. // TODO: This isn't necessary. If we used scalar types, we might avoid some // unnecessary splats, but we might miss load folding opportunities. Should // this decision be based on OptimizeForSize? bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue SignMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); SDValue MagMask = DAG.getConstantFP( APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT); // First, clear all bits but the sign bit from the second operand (sign). if (IsFakeVector) Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); // Next, clear the sign bit from the first operand (magnitude). // TODO: If we had general constant folding for FP logic ops, this check // wouldn't be necessary. SDValue MagBits; if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) { APFloat APF = Op0CN->getValueAPF(); APF.clearSign(); MagBits = DAG.getConstantFP(APF, dl, LogicVT); } else { // If the magnitude operand wasn't a constant, we need to AND out the sign. if (IsFakeVector) Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); } // OR the magnitude value with the sign bit. SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT OpVT = N0.getSimpleValueType(); assert((OpVT == MVT::f32 || OpVT == MVT::f64) && "Unexpected type for FGETSIGN"); // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). MVT VecVT = (OpVT == MVT::f32 ? 
MVT::v4f32 : MVT::v2f64); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); Res = DAG.getZExtOrTrunc(Res, dl, VT); Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); return Res; } /// Helper for creating a X86ISD::SETCC node. static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, SelectionDAG &DAG) { return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, DAG.getConstant(Cond, dl, MVT::i8), EFLAGS); } // Check whether an OR'd tree is PTEST-able. static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &X86CC) { assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); if (!Subtarget.hasSSE41()) return SDValue(); if (!Op->hasOneUse()) return SDValue(); SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Opnds; DenseMap VecInMap; SmallVector VecIns; EVT VT = MVT::Other; // Recognize a special case where a vector is casted into wide integer to // test all 0s. Opnds.push_back(N->getOperand(0)); Opnds.push_back(N->getOperand(1)); for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { SmallVectorImpl::const_iterator I = Opnds.begin() + Slot; // BFS traverse all OR'd operands. if (I->getOpcode() == ISD::OR) { Opnds.push_back(I->getOperand(0)); Opnds.push_back(I->getOperand(1)); // Re-evaluate the number of nodes to be traversed. e += 2; // 2 more nodes (LHS and RHS) are pushed. continue; } // Quit if a non-EXTRACT_VECTOR_ELT if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // Quit if without a constant index. SDValue Idx = I->getOperand(1); if (!isa(Idx)) return SDValue(); SDValue ExtractedFromVec = I->getOperand(0); DenseMap::iterator M = VecInMap.find(ExtractedFromVec); if (M == VecInMap.end()) { VT = ExtractedFromVec.getValueType(); // Quit if not 128/256-bit vector. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); // Quit if not the same type. if (VecInMap.begin() != VecInMap.end() && VT != VecInMap.begin()->first.getValueType()) return SDValue(); M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; VecIns.push_back(ExtractedFromVec); } M->second |= 1U << cast(Idx)->getZExtValue(); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Not extracted from 128-/256-bit vector."); unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; for (DenseMap::const_iterator I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { // Quit if not all elements are used. if (I->second != FullMask) return SDValue(); } MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]); // If more than one full vector is evaluated, OR them first before PTEST. for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { // Each iteration will OR 2 nodes and append the result until there is only // 1 node left, i.e. the final OR'd value of all vectors. SDValue LHS = VecIns[Slot]; SDValue RHS = VecIns[Slot + 1]; VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); } X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, DL, MVT::i8); return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back()); } /// return true if \c Op has a use that doesn't just read flags. 
static bool hasNonFlagsUse(SDValue Op) { for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned UOpNo = UI.getOperandNo(); if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { // Look pass truncate. UOpNo = User->use_begin().getOperandNo(); User = *User->use_begin(); } if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) return true; } return false; } /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // CF and OF aren't always set the way we want. Determine which // of these we need. bool NeedCF = false; bool NeedOF = false; switch (X86CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: NeedCF = true; break; case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: { // Check if we really need to set the // Overflow flag. If NoSignedWrap is present // that is not actually needed. switch (Op->getOpcode()) { case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::SHL: if (Op.getNode()->getFlags().hasNoSignedWrap()) break; LLVM_FALLTHROUGH; default: NeedOF = true; break; } break; } } // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. if (Op.getResNo() != 0 || NeedOF || NeedCF) { // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } unsigned Opcode = 0; unsigned NumOperands = 0; SDValue ArithOp = Op; // NOTICE: In the code below we use ArithOp to hold the arithmetic operation // which may be the result of a CAST. We use the variable 'Op', which is the // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::AND: // If the primary 'and' result isn't used, don't bother using X86ISD::AND, // because a TEST instruction will be better. if (!hasNonFlagsUse(Op)) break; LLVM_FALLTHROUGH; case ISD::ADD: case ISD::SUB: case ISD::OR: case ISD::XOR: // Transform to an x86-specific ALU node with flags if there is a chance of // using an RMW op or only the flags are used. Otherwise, leave // the node alone and emit a 'test' instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC && UI->getOpcode() != ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); case ISD::ADD: Opcode = X86ISD::ADD; break; case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; case ISD::OR: Opcode = X86ISD::OR; break; } NumOperands = 2; break; case X86ISD::ADD: case X86ISD::SUB: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: return SDValue(Op.getNode(), 1); default: default_case: break; } if (Opcode == 0) { // Emit a CMP with 0, which is the TEST pattern. 
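    // CMP against zero and TEST Op,Op set SF/ZF/PF identically and both leave
    // OF and CF cleared, so isel is free to pick the shorter TEST encoding.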
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); return SDValue(New.getNode(), 1); } /// Emit nodes that will be selected as "cmp Op0,Op1", or something /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG) const { if (isNullConstant(Op1)) return EmitTest(Op0, X86CC, dl, DAG, Subtarget); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { // Only promote the compare up to I32 if it is a 16 bit operation // with an immediate. 16 bit immediates are to be avoided. if (Op0.getValueType() == MVT::i16 && ((isa(Op0) && !cast(Op0)->getAPIntValue().isSignedIntN(8)) || (isa(Op1) && !cast(Op1)->getAPIntValue().isSignedIntN(8))) && !DAG.getMachineFunction().getFunction().optForMinSize() && !Subtarget.isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0); Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1); } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } assert(Op0.getValueType().isFloatingPoint() && "Unexpected VT!"); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); } /// Convert a comparison if required by the subtarget. SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const { // If the subtarget does not support the FUCOMI instruction, floating-point // comparisons have to be converted. if (Subtarget.hasCMov() || Cmp.getOpcode() != X86ISD::CMP || !Cmp.getOperand(0).getValueType().isFloatingPoint() || !Cmp.getOperand(1).getValueType().isFloatingPoint()) return Cmp; // The instruction selector will select an FUCOM instruction instead of // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence // build an SDNode sequence that transfers the result from FPSW into EFLAGS: // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8)))) SDLoc dl(Cmp); SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp); SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW); SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } /// Check if replacement of SQRT with RSQRT should be disabled. bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // We never want to use both SQRT and RSQRT instructions for the same input. if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) return false; if (VT.isVector()) return Subtarget.hasFastVectorFSQRT(); return Subtarget.hasFastScalarFSQRT(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). 
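/// A single iteration of x1 = x0 * (1.5 - 0.5 * a * x0 * x0) roughly doubles
/// the number of correct bits, taking the ~12-bit RSQRTSS/RSQRTPS estimate to
/// the ~24 bits a float can hold.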
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { EVT VT = Op.getValueType(); // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. // It is likely not profitable to do this for f64 because a double-precision // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; UseOneConstNR = false; // There is no FSQRT for 512-bits, but there is RSQRT14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// The minimum architected relative accuracy is 2^-12. We need one /// Newton-Raphson step to have a good float result (24 bits of precision). SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { EVT VT = Op.getValueType(); // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. // It is likely not profitable to do this for f64 because a double-precision // reciprocal estimate with refinement on x86 prior to FMA requires // 15 instructions: convert to single, rcpss, convert back to double, refine // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA // along with FMA, this could be a throughput win. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::v4f32 && Subtarget.hasSSE1()) || (VT == MVT::v8f32 && Subtarget.hasAVX()) || (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { // Enable estimate codegen with 1 refinement step for vector division. // Scalar division estimates are disabled because they break too much // real-world code. These defaults are intended to match GCC behavior. if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) return SDValue(); if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; // There is no FSQRT for 512-bits, but there is RCP14. unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; return DAG.getNode(Opcode, SDLoc(Op), VT, Op); } return SDValue(); } /// If we have at least two divisions that use the same divisor, convert to /// multiplication by a reciprocal. This may need to be adjusted for a given /// CPU if a division's cost is not at least twice the cost of a multiplication. /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. unsigned X86TargetLowering::combineRepeatedFPDivisors() const { return 2; } /// Result of 'and' is compared against zero. Change to a BT node if possible. /// Returns the BT node and the condition code needed to use it. 
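/// For example, "(x & (1 << n)) == 0" becomes "BT x, n" with condition AE
/// (CF == 0), and the "!= 0" form uses condition B (CF == 1), since BT copies
/// the selected bit into the carry flag.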
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
                            SelectionDAG &DAG, SDValue &X86CC) {
  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue Src, BitNo;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known = DAG.computeKnownBits(Op0);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      Src = Op1;
      BitNo = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;
    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      Src = AndLHS.getOperand(0);
      BitNo = AndLHS.getOperand(1);
    } else {
      // Use BT if the immediate can't be encoded in a TEST instruction or we
      // are optimizing for size and the immediate won't fit in a byte.
      bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
      if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
          isPowerOf2_64(AndRHSVal)) {
        Src = AndLHS;
        BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
                                Src.getValueType());
      }
    }
  }

  // No patterns found, give up.
  if (!Src.getNode())
    return SDValue();

  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reason.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  X86CC = DAG.getConstant(CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B, dl,
                          MVT::i8);
  return DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
}

/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
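/// For example, ISD::SETOLT maps directly to immediate 1 (LT) while
/// ISD::SETOGT swaps its operands and reuses the same immediate. The returned
/// values 8 (SETUEQ) and 12 (SETONE) only exist as AVX compare predicates;
/// without AVX the caller expands them into two compares plus a logic op.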
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1) { unsigned SSECC; bool Swap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (SetCCOpcode) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETOEQ: case ISD::SETEQ: SSECC = 0; break; case ISD::SETOGT: case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLT: case ISD::SETOLT: SSECC = 1; break; case ISD::SETOGE: case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETLE: case ISD::SETOLE: SSECC = 2; break; case ISD::SETUO: SSECC = 3; break; case ISD::SETUNE: case ISD::SETNE: SSECC = 4; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: SSECC = 5; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; case ISD::SETUEQ: SSECC = 8; break; case ISD::SETONE: SSECC = 12; break; } if (Swap) std::swap(Op0, Op1); return SSECC; } /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then /// concatenate the result back. static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); SDValue CC = Op.getOperand(2); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); // Issue the operation on the smaller types and concatenate the result back MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC)); } static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(VT.getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast(CC)->get(); // If this is a seteq make sure any build vectors of all zeros are on the RHS. // This helps with vptestm matching. // TODO: Should we just canonicalize the setcc during DAG combine? if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) && ISD::isBuildVectorAllZeros(Op0.getNode())) std::swap(Op0, Op1); // Prefer SETGT over SETLT. if (SetCCOpcode == ISD::SETLT) { SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); std::swap(Op0, Op1); } return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); } /// Given a simple buildvector constant, return a new vector constant with each /// element decremented. If decrementing would result in underflow or this /// is not a simple vector constant, return an empty value. 
static SDValue decrementVectorConstant(SDValue V, SelectionDAG &DAG) {
  auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
  if (!BV)
    return SDValue();

  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<SDValue, 8> NewVecC;
  SDLoc DL(V);

  for (unsigned i = 0; i < NumElts; ++i) {
    auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
      return SDValue();

    // Avoid underflow.
    if (Elt->getAPIntValue().isNullValue())
      return SDValue();

    NewVecC.push_back(DAG.getConstant(Elt->getAPIntValue() - 1, DL, EltVT));
  }

  return DAG.getBuildVector(VT, DL, NewVecC);
}

/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
/// Op0 u<= Op1:
/// t = psubus Op0, Op1
/// pcmpeq t, <0..0>
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
                                    ISD::CondCode Cond, const SDLoc &dl,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  MVT VET = VT.getVectorElementType();
  if (VET != MVT::i8 && VET != MVT::i16)
    return SDValue();

  switch (Cond) {
  default:
    return SDValue();
  case ISD::SETULT: {
    // If the comparison is against a constant we can turn this into a
    // setule. With psubus, setule does not require a swap. This is
    // beneficial because the constant in the register is no longer
    // destructed as the destination so it can be hoisted out of a loop.
    // Only do this pre-AVX since vpcmp* is no longer destructive.
    if (Subtarget.hasAVX())
      return SDValue();
    SDValue ULEOp1 = decrementVectorConstant(Op1, DAG);
    if (!ULEOp1)
      return SDValue();
    Op1 = ULEOp1;
    break;
  }
  // Psubus is better than flip-sign because it requires no inversion.
  case ISD::SETUGE:
    std::swap(Op0, Op1);
    break;
  case ISD::SETULE:
    break;
  }

  SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
  return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                     DAG.getConstant(0, dl, VT));
}

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This
      // allows them to work with an SSE1 target (integer vector types are
      // not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
    if (SSECC >= 8 && !Subtarget.hasAVX()) {
      // LLVM predicate is SETUEQ or SETONE.
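      // SETUEQ decomposes as (UNORD || EQ) and SETONE as (ORD && NEQ), which
      // is exactly the pair of compares plus FOR/FAND emitted below.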
unsigned CC0, CC1; unsigned CombineOpc; if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ CombineOpc = X86ISD::FOR; } else { assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ CombineOpc = X86ISD::FAND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC0, dl, MVT::i8)); SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CC1, dl, MVT::i8)); Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); } else { // Handle all other FP comparisons here. Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, dl, MVT::i8)); } // If this is SSE/AVX CMPP, bitcast the result back to integer to match the // result type of SETCC. The bitcast is expected to be optimized away // during combining/isel. if (Opc == X86ISD::CMPP) Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); return Cmp; } MVT VTOp0 = Op0.getSimpleValueType(); assert(VTOp0 == Op1.getSimpleValueType() && "Expected operands with same type!"); assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && "Invalid number of packed elements for source and destination!"); // This is being called by type legalization because v2i32 is marked custom // for result type legalization for v2f32. if (VTOp0 == MVT::v2i32) return SDValue(); // The non-AVX512 code below works under the assumption that source and // destination types are the same. assert((Subtarget.hasAVX512() || (VT == VTOp0)) && "Value types for source and destination must be the same!"); // Break 256-bit integer vector compare into smaller ones. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntVSETCC(Op, DAG); // The result is boolean, but operands are int/float if (VT.getVectorElementType() == MVT::i1) { // In AVX-512 architecture setcc returns mask with i1 elements, // But there is no compare instruction for i8 and i16 elements in KNL. assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && "Unexpected operand type"); return LowerIntVSETCC_AVX512(Op, DAG); } // Lower using XOP integer comparisons. if (VT.is128BitVector() && Subtarget.hasXOP()) { // Translate compare code to XOP PCOM compare mode. unsigned CmpMode = 0; switch (Cond) { default: llvm_unreachable("Unexpected SETCC condition"); case ISD::SETULT: case ISD::SETLT: CmpMode = 0x00; break; case ISD::SETULE: case ISD::SETLE: CmpMode = 0x01; break; case ISD::SETUGT: case ISD::SETGT: CmpMode = 0x02; break; case ISD::SETUGE: case ISD::SETGE: CmpMode = 0x03; break; case ISD::SETEQ: CmpMode = 0x04; break; case ISD::SETNE: CmpMode = 0x05; break; } // Are we comparing unsigned or signed integers? unsigned Opc = ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(CmpMode, dl, MVT::i8)); } // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { SDValue BC0 = peekThroughBitcasts(Op0); if (BC0.getOpcode() == ISD::AND) { APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits, false, false)) { if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { Cond = ISD::SETEQ; Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); } } } } // If this is a SETNE against the signed minimum value, change it to SETGT. // If this is a SETNE against the signed maximum value, change it to SETLT. // which will be swapped to SETGT. // Otherwise we use PCMPEQ+invert. 
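  // E.g. for v4i32, "x != INT32_MIN" is the same predicate as
  // "x > INT32_MIN" because INT32_MIN is the unique smallest value, so the
  // PCMPEQ followed by an all-ones XOR collapses to a single PCMPGTD.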
APInt ConstValue; if (Cond == ISD::SETNE && ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) { if (ConstValue.isMinSignedValue()) Cond = ISD::SETGT; else if (ConstValue.isMaxSignedValue()) Cond = ISD::SETLT; } // If both operands are known non-negative, then an unsigned compare is the // same as a signed compare and there's no need to flip signbits. // TODO: We could check for more general simplifications here since we're // computing known bits. bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); // Special case: Use min/max operations for unsigned compares. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (ISD::isUnsignedIntSetCC(Cond) && (FlipSigns || ISD::isTrueWhenEqual(Cond)) && TLI.isOperationLegal(ISD::UMIN, VT)) { // If we have a constant operand, increment/decrement it and change the // condition to avoid an invert. // TODO: This could be extended to handle a non-splat constant by checking // that each element of the constant is not the max/null value. APInt C; if (Cond == ISD::SETUGT && isConstantSplat(Op1, C) && !C.isMaxValue()) { // X > C --> X >= (C+1) --> X == umax(X, C+1) Op1 = DAG.getConstant(C + 1, dl, VT); Cond = ISD::SETUGE; } if (Cond == ISD::SETULT && isConstantSplat(Op1, C) && !C.isNullValue()) { // X < C --> X <= (C-1) --> X == umin(X, C-1) Op1 = DAG.getConstant(C - 1, dl, VT); Cond = ISD::SETULE; } bool Invert = false; unsigned Opc; switch (Cond) { default: llvm_unreachable("Unexpected condition code"); case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETULE: Opc = ISD::UMIN; break; case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ISD::UMAX; break; } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to use SUBUS and PCMPEQ. if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) return V; // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ : X86ISD::PCMPGT; bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || Cond == ISD::SETGE || Cond == ISD::SETUGE; bool Invert = Cond == ISD::SETNE || (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); if (Swap) std::swap(Op0, Op1); // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { assert(Subtarget.hasSSE2() && "Don't know how to lower!"); // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. The lower // compare is always unsigned. SDValue SB; if (FlipSigns) { SB = DAG.getConstant(0x8000000080000000ULL, dl, MVT::v2i64); } else { SB = DAG.getConstant(0x0000000080000000ULL, dl, MVT::v2i64); } Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); // Cast everything to the right type. 
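      // The XOR above biases the operands so a signed PCMPGTD can stand in
      // for the required compare: x <u y iff (x ^ 0x80000000) <s
      // (y ^ 0x80000000). Bit 31 of each low dword is flipped even for a
      // signed 64-bit compare, because the low halves always contribute as
      // unsigned quantities.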
Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); // Create masks for only the low parts/high parts of the 64 bit integers. static const int MaskHi[] = { 1, 1, 3, 3 }; static const int MaskLo[] = { 0, 0, 2, 2 }; SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with // pcmpeqd + pshufd + pand. assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); // First cast everything to the right type. Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); // Do the compare. SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); // Make sure the lower and upper halves are both all-ones. static const int Mask[] = { 1, 0, 3, 2 }; SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); if (Invert) Result = DAG.getNOT(dl, Result, MVT::v4i32); return DAG.getBitcast(VT, Result); } } // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. if (FlipSigns) { MVT EltVT = VT.getVectorElementType(); SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); } SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } // Try to select this as a KORTEST+SETCC if possible. static SDValue EmitKORTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC) { // Only support equality comparisons. if (CC != ISD::SETEQ && CC != ISD::SETNE) return SDValue(); // Must be a bitcast from vXi1. if (Op0.getOpcode() != ISD::BITCAST) return SDValue(); Op0 = Op0.getOperand(0); MVT VT = Op0.getSimpleValueType(); if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) && !(Subtarget.hasDQI() && VT == MVT::v8i1) && !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) return SDValue(); X86::CondCode X86Cond; if (isNullConstant(Op1)) { X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; } else if (isAllOnesConstant(Op1)) { // C flag is set for all ones. X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE; } else return SDValue(); // If the input is an OR, we can combine it's operands into the KORTEST. SDValue LHS = Op0; SDValue RHS = Op0; if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) { LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); } X86CC = DAG.getConstant(X86Cond, dl, MVT::i8); return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); } /// Emit flags for the given setcc condition and operands. Also returns the /// corresponding X86 condition code constant in X86CC. 
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, SDValue &X86CC) const { // Optimize to BT if possible. // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CC)) return BT; } // Try to use PTEST for a tree ORs equality compared with 0. // TODO: We could do AND tree with all 1s as well by using the C flag. if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC)) return PTEST; } // Try to lower using KORTEST. if (SDValue KORTEST = EmitKORTEST(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) return KORTEST; // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); X86CC = Op0.getOperand(0); if (Invert) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); X86CC = DAG.getConstant(CCode, dl, MVT::i8); } return Op0.getOperand(1); } } bool IsFP = Op1.getSimpleValueType().isFloatingPoint(); X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG); if (CondCode == X86::COND_INVALID) return SDValue(); SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); X86CC = DAG.getConstant(CondCode, dl, MVT::i8); return EFLAGS; } SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDLoc dl(Op); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue X86CC; SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); if (!EFLAGS) return SDValue(); return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); X86::CondCode CC = TranslateIntegerX86CC(cast(Cond)->get()); // Recreate the carry if needed. EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); return getSETCC(CC, Cmp.getValue(1), DL, DAG); } // This function returns three things: the arithmetic computation itself // (Value), an EFLAGS result (Overflow), and a condition code (Cond). The // flag and the condition code define the case in which the arithmetic // computation overflows. 
static std::pair getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { assert(Op.getResNo() == 0 && "Unexpected result number!"); SDValue Value, Overflow; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned BaseOp = 0; SDLoc DL(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_O; break; case ISD::UADDO: BaseOp = X86ISD::ADD; Cond = X86::COND_B; break; case ISD::SSUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_O; break; case ISD::USUBO: BaseOp = X86ISD::SUB; Cond = X86::COND_B; break; case ISD::SMULO: BaseOp = X86ISD::SMUL; Cond = X86::COND_O; break; case ISD::UMULO: BaseOp = X86ISD::UMUL; Cond = X86::COND_O; break; } if (BaseOp) { // Also sets EFLAGS. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" // has only one use. SDLoc DL(Op); X86::CondCode Cond; SDValue Value, Overflow; std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); } /// Return true if opcode is a X86 logical comparison. static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getOpcode(); if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || Opc == X86ISD::SAHF) return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; return false; } static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { if (V.getOpcode() != ISD::TRUNCATE) return false; SDValue VOp0 = V.getOperand(0); unsigned InBits = VOp0.getValueSizeInBits(); unsigned Bits = V.getValueSizeInBits(); return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits)); } SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { bool AddTest = true; SDValue Cond = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops // are available or VBLENDV if AVX is available. // Otherwise FP cmovs get lowered into a less efficient branch sequence later. 
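  // The compare produces an all-ones or all-zeros mask, so the select is just
  // (mask & trueval) | (~mask & falseval), i.e. the FAND/FANDN/FOR triple
  // emitted below (or a single variable blend when AVX is available).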
if (Cond.getOpcode() == ISD::SETCC && ((Subtarget.hasSSE2() && VT == MVT::f64) || (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); unsigned SSECC = translateX86FSETCC( cast(Cond.getOperand(2))->get(), CondOp0, CondOp1); if (Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); assert(!VT.isVector() && "Not a scalar type?"); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); // If we have AVX, we can use a variable vector select (VBLENDV) instead // of 3 logic instructions for size savings and potentially speed. // Unfortunately, there is no scalar form of VBLENDV. // If either operand is a +0.0 constant, don't try this. We can expect to // optimize away at least one of the logic instructions later in that // case, so that sequence would be faster than a variable blend. // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many // instructions as the AND/ANDN/OR sequence due to register moves, so // don't bother. if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && !isNullFPConstant(Op2)) { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel, DAG.getIntPtrConstant(0, DL)); } SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); } } // AVX512 fallback is to lower selects of scalar floats to masked moves. if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) { SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); } // For v64i1 without 64-bit support we need to split and rejoin. 
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { assert(Subtarget.hasBWI() && "Expected BWI to be legal"); SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32); SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32); SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32); SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32); SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo); SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); } if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { SDValue Op1Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) Op1Scalar = Op1.getOperand(0); SDValue Op2Scalar; if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) Op2Scalar = Op2.getOperand(0); if (Op1Scalar.getNode() && Op2Scalar.getNode()) { SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond, Op1Scalar, Op2Scalar); if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, newSelect); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, DAG.getIntPtrConstant(0, DL)); } } if (Cond.getOpcode() == ISD::SETCC) { if (SDValue NewCond = LowerSETCC(Cond, DAG)) { Cond = NewCond; // If the condition was updated, it's possible that the operands of the // select were also updated (for example, EmitTest has a RAUW). Refresh // the local references to the select operands in case they got stale. Op1 = Op.getOperand(1); Op2 = Op.getOperand(2); } } // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode = cast(Cond.getOperand(0))->getZExtValue(); if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb if (isNullConstant(Y) && (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); SDValue Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); Zero = DAG.getConstant(0, DL, Op.getValueType()); return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); } Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType())); Cmp = ConvertCmpIfNecessary(Cmp, DAG); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SDValue Zero = DAG.getConstant(0, DL, Op.getValueType()); SDValue Res = // Res = 0 or -1. 
DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E && Cmp.getOperand(0).getOpcode() == ISD::AND && isOneConstant(Cmp.getOperand(0).getOperand(1))) { SDValue CmpOp0 = Cmp.getOperand(0); SDValue Src1, Src2; // true if Op2 is XOR or OR operator and one of its operands // is equal to Op1 // ( a , a op b) || ( b , a op b) auto isOrXorPattern = [&]() { if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { Src1 = Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); Src2 = Op1; return true; } return false; }; if (isOrXorPattern()) { SDValue Neg; unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); // we need mask of all zeros or ones with same size of the other // operands. if (CmpSz > VT.getSizeInBits()) Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); else if (CmpSz < VT.getSizeInBits()) Neg = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), DAG.getConstant(1, DL, VT)); else Neg = CmpOp0; SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Neg); // -(and (x, 0x1)) SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y } } } // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); MVT VT = Op.getSimpleValueType(); bool IllegalFPCMov = false; if (VT.isFloatingPoint() && !VT.isVector() && !isScalarFPTypeInSSEReg(VT)) // FPStack? IllegalFPCMov = !hasFPCMov(cast(CC)->getSExtValue()); if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || Opc == X86ISD::BT) { // FIXME Cond = Cmp; AddTest = false; } } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); CC = DAG.getConstant(X86Cond, DL, MVT::i8); AddTest = false; } if (AddTest) { // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, BTCC)) { CC = BTCC; Cond = BT; AddTest = false; } } } if (AddTest) { CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()), X86::COND_NE, DL, DAG); } // a < b ? -1 : 0 -> RES = ~setcc_carry // a < b ? 0 : -1 -> RES = setcc_carry // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { Cond = ConvertCmpIfNecessary(Cond, DAG); unsigned CondCode = cast(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } } // X86 doesn't have an i8 cmov. If both operands are the result of a truncate // widen the cmov and push the truncate through. This avoids introducing a new // branch during isel and doesn't add any extensions. if (Op.getValueType() == MVT::i8 && Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // Promote i16 cmovs if it won't prevent folding a load. if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) { Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); SDValue Ops[] = { Op2, Op1, CC, Cond }; SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDValue Ops[] = { Op2, Op1, CC, Cond }; return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); // Extend VT if the scalar type is i8/i16 and BWI is not supported. MVT ExtVT = VT; if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) { // If v16i32 is to be avoided, we'll need to split and concatenate. if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG); ExtVT = MVT::getVectorVT(MVT::i32, NumElts); } // Widen to 512-bits if VLX is not supported. MVT WideVT = ExtVT; if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { NumElts *= 512 / ExtVT.getSizeInBits(); InVT = MVT::getVectorVT(MVT::i1, NumElts); In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), In, DAG.getIntPtrConstant(0, dl)); WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); } SDValue V; MVT WideEltVT = WideVT.getVectorElementType(); if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); } else { SDValue NegOne = DAG.getConstant(-1, dl, WideVT); SDValue Zero = DAG.getConstant(0, dl, WideVT); V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } // Truncate if we had to extend i16/i8 above. 
if (VT != ExtVT) { WideVT = MVT::getVectorVT(VTElt, NumElts); V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); } // Extract back to 128/256-bit if we widened. if (WideVT != VT) V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, DAG.getIntPtrConstant(0, dl)); return V; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); if (InVT.getVectorElementType() == MVT::i1) return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); assert(Subtarget.hasAVX() && "Expected AVX support"); return LowerAVXExtend(Op, DAG, Subtarget); } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. // For sign extend this needs to handle all vector sizes and SSE4.1 and // non-SSE4.1 targets. For zero extend this should only handle inputs of // MVT::v64i8 when BWI is not supported, but AVX512 is. static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = Op->getOperand(0); MVT VT = Op->getSimpleValueType(0); MVT InVT = In.getSimpleValueType(); MVT SVT = VT.getVectorElementType(); MVT InSVT = InVT.getVectorElementType(); assert(SVT.getSizeInBits() > InSVT.getSizeInBits()); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && !(VT.is256BitVector() && Subtarget.hasAVX()) && !(VT.is512BitVector() && Subtarget.hasAVX512())) return SDValue(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); // For 256-bit vectors, we only need the lower (128-bit) half of the input. // For 512-bit vectors, we need 128-bits or 256-bits. if (InVT.getSizeInBits() > 128) { // Input needs to be at least the same number of elements as output, and // at least 128-bits. int InSize = InSVT.getSizeInBits() * NumElts; In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); InVT = In.getSimpleValueType(); } // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results, // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still // need to be handled here for 256/512-bit results. if (Subtarget.hasInt256()) { assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); if (InVT.getVectorNumElements() != NumElts) return DAG.getNode(Op.getOpcode(), dl, VT, In); // FIXME: Apparently we create inreg operations that could be regular // extends. unsigned ExtOpc = Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, dl, VT, In); } // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. if (Subtarget.hasAVX()) { assert(VT.is256BitVector() && "256-bit vector expected"); int HalfNumElts = NumElts / 2; MVT HalfVT = MVT::getVectorVT(SVT, HalfNumElts); unsigned NumSrcElts = InVT.getVectorNumElements(); SmallVector HiMask(NumSrcElts, SM_SentinelUndef); for (int i = 0; i != HalfNumElts; ++i) HiMask[i] = HalfNumElts + i; SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In); SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask); Hi = DAG.getNode(Opc, dl, HalfVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } // We should only get here for sign extend. 
  assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
  assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  SDValue SignExt = Curr;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  if (InVT != MVT::v4i32) {
    MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;

    unsigned DestWidth = DestVT.getScalarSizeInBits();
    unsigned Scale = DestWidth / InSVT.getSizeInBits();

    unsigned InNumElts = InVT.getVectorNumElements();
    unsigned DestElts = DestVT.getVectorNumElements();

    // Build a shuffle mask that takes each input element and places it in the
    // MSBs of the new element size.
    SmallVector Mask(InNumElts, SM_SentinelUndef);
    for (unsigned i = 0; i != DestElts; ++i)
      Mask[i * Scale + (Scale - 1)] = i;

    Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
    Curr = DAG.getBitcast(DestVT, Curr);

    unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (VT == MVT::v2i64) {
    assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
    SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
    SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
    SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
    SignExt = DAG.getBitcast(VT, SignExt);
  }

  return SignExt;
}

static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

  assert(VT.isVector() && InVT.isVector() && "Expected vector type");
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Expected same number of elements");
  assert((VT.getVectorElementType() == MVT::i16 ||
          VT.getVectorElementType() == MVT::i32 ||
          VT.getVectorElementType() == MVT::i64) &&
         "Unexpected element type");
  assert((InVT.getVectorElementType() == MVT::i8 ||
          InVT.getVectorElementType() == MVT::i16 ||
          InVT.getVectorElementType() == MVT::i32) &&
         "Unexpected element type");

  // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
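  // The pre-SSE4.1 expansion above sign-extends by first shuffling each
  // narrow element into the most significant bits of a wider lane and then
  // doing an arithmetic shift right (VSRAI). A minimal scalar sketch of the
  // same trick for i8->i32 (illustration only; assumes the usual arithmetic
  // right shift on signed values, which is what SRAI provides):
#if 0
#include <cassert>
#include <cstdint>

// Sign-extend the low 8 bits of 'x' to 32 bits without a dedicated
// sign-extension instruction: place the byte in the MSBs, then shift it
// back down arithmetically so the sign bit is replicated.
static int32_t SignExtendByte(uint32_t x) {
  return (int32_t)(x << 24) >> 24;
}

int main() {
  assert(SignExtendByte(0x7F) == 127);
  assert(SignExtendByte(0x80) == -128);
  assert(SignExtendByte(0xFF) == -1);
  return 0;
}
#endif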
if (InVT == MVT::v8i8) { if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64) return SDValue(); In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8)); return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In); } if (Subtarget.hasInt256()) return Op; // Optimize vectors in AVX mode // Sign extend v8i16 to v8i32 and // v4i32 to v4i64 // // Divide input vector into two parts // for v4i32 the high shuffle mask will be {2, 3, -1, -1} // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 // concat the vectors to original VT MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() / 2); SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); unsigned NumElems = InVT.getVectorNumElements(); SmallVector ShufMask(NumElems, -1); for (unsigned i = 0; i != NumElems/2; ++i) ShufMask[i] = i + NumElems/2; SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { StoreSDNode *St = cast(Op.getNode()); SDLoc dl(St); SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. if (StoredVal.getValueType().isVector() && StoredVal.getValueType().getVectorElementType() == MVT::i1) { assert(StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT"); assert(!St->isTruncatingStore() && "Expected non-truncating store"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getUNDEF(MVT::v16i1), StoredVal, DAG.getIntPtrConstant(0, dl)); StoredVal = DAG.getBitcast(MVT::i16, StoredVal); StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } if (St->isTruncatingStore()) return SDValue(); MVT StoreVT = StoredVal.getSimpleValueType(); assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 && "Unexpected VT"); if (DAG.getTargetLoweringInfo().getTypeAction(*DAG.getContext(), StoreVT) != TargetLowering::TypeWidenVector) return SDValue(); // Widen the vector, cast to a v2x64 type, extract the single 64-bit element // and store it. MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(), StoreVT.getVectorNumElements() * 2); StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, DAG.getUNDEF(StoreVT)); MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; MVT CastVT = MVT::getVectorVT(StVT, 2); StoredVal = DAG.getBitcast(CastVT, StoredVal); StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, DAG.getIntPtrConstant(0, dl)); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // Lower vector extended loads using a shuffle. If SSSE3 is not available we // may emit an illegal shuffle but the expansion is still better than scalar // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise // we'll emit a shuffle and a arithmetic shift. // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. 
// TODO: It is possible to support ZExt by zeroing the undef values during // the shuffle phase or after the shuffle. static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT RegVT = Op.getSimpleValueType(); assert(RegVT.isVector() && "We only custom lower vector loads."); assert(RegVT.isInteger() && "We only custom lower integer vector loads."); LoadSDNode *Ld = cast(Op.getNode()); SDLoc dl(Ld); EVT MemVT = Ld->getMemoryVT(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. if (RegVT.getVectorElementType() == MVT::i1) { assert(EVT(RegVT) == MemVT && "Expected non-extending load"); assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && "Expected AVX512F without AVX512DQI"); SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); // Replace chain users with the new chain. assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!"); SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, DAG.getBitcast(MVT::v16i1, Val), DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); } // Nothing useful we can do without SSE2 shuffles. assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2."); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); ISD::LoadExtType Ext = Ld->getExtensionType(); assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD) && "Only anyext and sext are currently implemented."); assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); unsigned NumElems = RegVT.getVectorNumElements(); unsigned MemSz = MemVT.getSizeInBits(); assert(RegSz > MemSz && "Register size must be greater than the mem size"); if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) { // The only way in which we have a legal 256-bit vector result but not the // integer 256-bit operations needed to directly lower a sextload is if we // have AVX1 but not AVX2. In that case, we can always emit a sextload to // a 128-bit vector and a normal sign_extend to 256-bits that should get // correctly legalized. We do this late to allow the canonical form of // sextload to persist throughout the rest of the DAG combiner -- it wants // to fold together any extensions it can, and so will fuse a sign_extend // of an sextload into a sextload targeting a wider value. SDValue Load; if (MemSz == 128) { // Just switch this to a normal load. assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, " "it must be a legal 128-bit vector " "type!"); Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } else { assert(MemSz < 128 && "Can't extend a type wider than 128 bits to a 256 bit vector!"); // Do an sext load to a 128-bit vector type. We want to use the same // number of elements, but elements half as wide. This will end up being // recursively lowered by this routine, but will succeed as we definitely // have all the necessary features if we're using AVX1. 
EVT HalfEltVT = EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2); EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems); Load = DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), MemVT, Ld->getAlignment(), Ld->getMemOperand()->getFlags()); } // Replace chain users with the new chain. assert(Load->getNumValues() == 2 && "Loads must carry a chain!"); // Finally, do a normal sign-extend to the desired register. SDValue SExt = DAG.getSExtOrTrunc(Load, dl, RegVT); return DAG.getMergeValues({SExt, Load.getValue(1)}, dl); } // All sizes must be a power of two. assert(isPowerOf2_32(RegSz * MemSz * NumElems) && "Non-power-of-two elements are not custom lowered!"); // Attempt to load the original value using scalar loads. // Find the largest scalar type that divides the total loaded size. MVT SclrLoadTy = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) { SclrLoadTy = Tp; } } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 && (64 <= MemSz)) SclrLoadTy = MVT::f64; // Calculate the number of scalar loads that we need to perform // in order to load our vector from memory. unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits(); assert((Ext != ISD::SEXTLOAD || NumLoads == 1) && "Can only lower sext loads with a single scalar load!"); unsigned loadRegSize = RegSz; if (Ext == ISD::SEXTLOAD && RegSz >= 256) loadRegSize = 128; // If we don't have BWI we won't be able to create the shuffle needed for // v8i8->v8i64. if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) loadRegSize = 128; // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits()); // Represent the data using the same element type that is stored in // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), loadRegSize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); // We can't shuffle using an illegal type. assert(TLI.isTypeLegal(WideVecVT) && "We only lower types that form legal widened vector types"); SmallVector Chains; SDValue Ptr = Ld->getBasePtr(); unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8; SDValue Increment = DAG.getConstant(OffsetInc, dl, TLI.getPointerTy(DAG.getDataLayout())); SDValue Res = DAG.getUNDEF(LoadUnitVecVT); unsigned Offset = 0; for (unsigned i = 0; i < NumLoads; ++i) { unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset); // Perform a single load. SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo().getWithOffset(Offset), NewAlign, Ld->getMemOperand()->getFlags()); Chains.push_back(ScalarLoad.getValue(1)); // Create the first element type using SCALAR_TO_VECTOR in order to avoid // another round of DAGCombining. 
if (i == 0) Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad); else Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res, ScalarLoad, DAG.getIntPtrConstant(i, dl)); Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment); Offset += OffsetInc; } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); // Bitcast the loaded value to a vector of the original element type, in // the size of the target vector type. SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res); unsigned SizeRatio = RegSz / MemSz; if (Ext == ISD::SEXTLOAD) { SDValue Sext = getExtendInVec(/*Signed*/true, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && MemVT == MVT::v8i8) { SDValue Sext = getExtendInVec(/*Signed*/false, dl, RegVT, SlicedVec, DAG); return DAG.getMergeValues({Sext, TF}, dl); } // Redistribute the loaded elements into the different locations. SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i * SizeRatio] = i; SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // Bitcast to the requested type. Shuff = DAG.getBitcast(RegVT, Shuff); return DAG.getMergeValues({Shuff, TF}, dl); } /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes /// each of which has no other use apart from the AND / OR. static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { Opc = Op.getOpcode(); if (Opc != ISD::OR && Opc != ISD::AND) return false; return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse() && Op.getOperand(1).getOpcode() == X86ISD::SETCC && Op.getOperand(1).hasOneUse()); } /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the /// SETCC node has a single use. static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && Op.getOperand(0).hasOneUse(); return false; } SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { bool addTest = true; SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); SDValue CC; bool Inverted = false; if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast(Cond.getOperand(2))->get() == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || Cond.getOperand(0).getOpcode() == ISD::SSUBO || Cond.getOperand(0).getOpcode() == ISD::USUBO || Cond.getOperand(0).getOpcode() == ISD::SMULO || Cond.getOperand(0).getOpcode() == ISD::UMULO)) { Inverted = true; Cond = Cond.getOperand(0); } else { if (SDValue NewCond = LowerSETCC(Cond, DAG)) Cond = NewCond; } } #if 0 // FIXME: LowerXALUO doesn't handle these!! else if (Cond.getOpcode() == X86ISD::ADD || Cond.getOpcode() == X86ISD::SUB || Cond.getOpcode() == X86ISD::SMUL || Cond.getOpcode() == X86ISD::UMUL) Cond = LowerXALUO(Cond, DAG); #endif // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && isOneConstant(Cond.getOperand(1))) Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
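  // LowerBRCOND below turns a branch on (setcc AND/OR setcc) into two
  // conditional branches instead of materializing both booleans and
  // combining them. A standalone sketch of the source-level shape this
  // targets (illustration only; the function name is made up):
#if 0
// Conceptually, "if (x == 0 && y == 0)" is emitted as two conditional
// branches -- the first compare jumps straight to the false block -- rather
// than as two SETCCs plus an AND plus another TEST/JNE.
bool BothZeroBranchy(int x, int y) {
  if (x != 0)
    return false; // first branch
  if (y != 0)
    return false; // second branch
  return true;
}
#endif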
unsigned CondOpcode = Cond.getOpcode(); if (CondOpcode == X86ISD::SETCC || CondOpcode == X86ISD::SETCC_CARRY) { CC = Cond.getOperand(0); SDValue Cmp = Cond.getOperand(1); unsigned Opc = Cmp.getOpcode(); // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { Cond = Cmp; addTest = false; } else { switch (cast(CC)->getZExtValue()) { default: break; case X86::COND_O: case X86::COND_B: // These can only come from an arithmetic instruction with overflow, // e.g. SADDO, UADDO. Cond = Cond.getOperand(1); addTest = false; break; } } } CondOpcode = Cond.getOpcode(); if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { SDValue Value; X86::CondCode X86Cond; std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); if (Inverted) X86Cond = X86::GetOppositeBranchCondition(X86Cond); CC = DAG.getConstant(X86Cond, dl, MVT::i8); addTest = false; } else { unsigned CondOpc; if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { SDValue Cmp = Cond.getOperand(0).getOperand(1); if (CondOpc == ISD::OR) { // Also, recognize the pattern generated by an FCMP_UNE. We can emit // two branches instead of an explicit OR instruction with a // separate test. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp)) { CC = Cond.getOperand(0).getOperand(0); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = Cond.getOperand(1).getOperand(0); Cond = Cmp; addTest = false; } } else { // ISD::AND // Also, recognize the pattern generated by an FCMP_OEQ. We can emit // two branches instead of an explicit AND instruction with a // separate test. However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Cmp == Cond.getOperand(1).getOperand(1) && isX86LogicalCmp(Cmp) && Op.getNode()->hasOneUse()) { X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); X86::CondCode CCode = (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cmp; addTest = false; } } } } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. // It should be transformed during dag combiner except when the condition // is set by a arithmetics with overflow node. X86::CondCode CCode = (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); CCode = X86::GetOppositeBranchCondition(CCode); CC = DAG.getConstant(CCode, dl, MVT::i8); Cond = Cond.getOperand(0).getOperand(1); addTest = false; } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETOEQ) { // For FCMP_OEQ, we can emit // two branches instead of an explicit AND instruction with a // separate test. 
However, we only do this if this block doesn't // have a fall-through edge, because this requires an explicit // jmp when the condition is false. if (Op.getNode()->hasOneUse()) { SDNode *User = *Op.getNode()->use_begin(); // Look for an unconditional branch following this conditional branch. // We need this because we need to reverse the successors in order // to implement FCMP_OEQ. if (User->getOpcode() == ISD::BR) { SDValue FalseBB = User->getOperand(1); SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); assert(NewBR == User); (void)NewBR; Dest = FalseBB; SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } } else if (Cond.getOpcode() == ISD::SETCC && cast(Cond.getOperand(2))->get() == ISD::SETUNE) { // For FCMP_UNE, we can emit // two branches instead of an explicit OR instruction with a // separate test. SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, Cond.getOperand(0), Cond.getOperand(1)); Cmp = ConvertCmpIfNecessary(Cmp, DAG); CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8); Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cmp); CC = DAG.getConstant(X86::COND_P, dl, MVT::i8); Cond = Cmp; addTest = false; } } if (addTest) { // Look pass the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { SDValue BTCC; if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) { CC = BTCC; Cond = BT; addTest = false; } } } if (addTest) { X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE; CC = DAG.getConstant(X86Cond, dl, MVT::i8); Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()), X86Cond, dl, DAG); } Cond = ConvertCmpIfNecessary(Cond, DAG); return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain, Dest, CC, Cond); } // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. // Calls to _alloca are needed to probe the stack when allocating more than 4k // bytes in one go. Touching the stack at 4K increments is necessary to ensure // that the guard pages used by the OS virtual memory manager are allocated in // correct sequence. SDValue X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool SplitStack = MF.shouldSplitStack(); bool EmitStackProbe = !getStackProbeSymbolName(MF).empty(); bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || SplitStack || EmitStackProbe; SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
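  // The probing requirement described above exists because the OS grows the
  // stack one guard page at a time: every page between the old and the new
  // stack pointer must be touched in order, or the guard page is skipped and
  // the access faults. A conceptual sketch of page-by-page probing (assuming
  // 4 KiB pages; illustration only -- the real work is done by
  // _alloca/__chkstk):
#if 0
#include <cstddef>

void TouchEveryPage(volatile char *Base, size_t Size) {
  const size_t PageSize = 4096;
  for (size_t Off = 0; Off < Size; Off += PageSize)
    Base[Off] = 0; // touch one byte per page so the OS commits it in order
}
#endif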
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); bool Is64Bit = Subtarget.is64Bit(); MVT SPTy = getPointerTy(DAG.getDataLayout()); SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" " not tell us which reg is the stack pointer!"); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) Result = DAG.getNode(ISD::AND, dl, VT, Result, DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function &F = MF.getFunction(); for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); } } const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); } else { SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size); MF.getInfo()->setHasWinAlloca(true); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned SPReg = RegInfo->getStackRegister(); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); Chain = SP.getValue(1); if (Align) { SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } Result = SP; } Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {Result, Chain}; return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto PtrVT = getPointerTy(MF.getDataLayout()); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const Value *SV = cast(Op.getOperand(2))->getValue(); SDLoc DL(Op); if (!Subtarget.is64Bit() || Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } // __va_list_tag: // gp_offset (0 - 6 * 8) // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). 
// reg_save_area SmallVector MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset SDValue Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV)); MemOps.push_back(Store); // Store fp_offset FIN = DAG.getMemBasePlusOffset(FIN, 4, DL); Store = DAG.getStore( Op.getOperand(0), DL, DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, MachinePointerInfo(SV, 4)); MemOps.push_back(Store); // Store ptr to overflow_arg_area FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); MemOps.push_back(Store); // Store ptr to reg_save_area. FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); Store = DAG.getStore( Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); unsigned Align = Op.getConstantOperandVal(3); SDLoc dl(Op); EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. // TODO: Implement the AMD64 ABI in its entirety. This simple // selection mechanism works only for the basic types. if (ArgVT == MVT::f80) { llvm_unreachable("va_arg for f80 not yet implemented"); } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { ArgMode = 2; // Argument passed in XMM register. Use fp_offset. } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. } else { llvm_unreachable("Unhandled argument type in LowerVAARG"); } if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. 
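  // Both LowerVASTART above and LowerVAARG here manipulate the SysV x86-64
  // __va_list_tag by field offset (0, 4, 8, 16 in LP64). A sketch of the
  // struct those offsets correspond to (illustration only, mirroring the ABI
  // definition rather than anything declared in this file):
#if 0
struct VaListTagSketch {
  unsigned gp_offset;      // offset 0:  next GPR slot in reg_save_area
  unsigned fp_offset;      // offset 4:  next XMM slot in reg_save_area
  void *overflow_arg_area; // offset 8:  arguments passed on the stack
  void *reg_save_area;     // offset 16: spilled register arguments
};
// On an LP64 x86-64 target this is 24 bytes, matching the 24-byte memcpy
// emitted by LowerVACOPY below.
static_assert(sizeof(VaListTagSketch) == 24, "x86-64 SysV va_list size");
#endif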
assert(!Subtarget.useSoftFloat() && !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } // Insert VAARG_64 node into the DAG // VAARG_64 returns two values: Variable Argument Address, Chain SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32), DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); SDValue VAARG = DAG.getMemIntrinsicNode( X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV), /*Align=*/0, MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); } static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); const Value *DstSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); SDLoc DL(Op); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false, false, false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } // Helper to get immediate/variable SSE shift opcode from other shift opcodes. static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) { switch (Opc) { case ISD::SHL: case X86ISD::VSHL: case X86ISD::VSHLI: return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI; case ISD::SRL: case X86ISD::VSRL: case X86ISD::VSRLI: return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI; case ISD::SRA: case X86ISD::VSRA: case X86ISD::VSRAI: return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI; } llvm_unreachable("Unknown target vector shift node"); } /// Handle vector element shifts where the shift amount is a constant. /// Takes immediate version of shift as input. static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG) { MVT ElementType = VT.getVectorElementType(); // Bitcast the source vector to the output type, this is mainly necessary for // vXi8/vXi64 shifts. if (VT != SrcOp.getSimpleValueType()) SrcOp = DAG.getBitcast(VT, SrcOp); // Fold this packed shift into its first operand if ShiftAmt is 0. if (ShiftAmt == 0) return SrcOp; // Check for ShiftAmt >= element width if (ShiftAmt >= ElementType.getSizeInBits()) { if (Opc == X86ISD::VSRAI) ShiftAmt = ElementType.getSizeInBits() - 1; else return DAG.getConstant(0, dl, VT); } assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) && "Unknown target vector shift-by-constant node"); // Fold this packed vector shift into a build vector if SrcOp is a // vector of Constants or UNDEFs. 
if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { SmallVector Elts; unsigned NumElts = SrcOp->getNumOperands(); ConstantSDNode *ND; switch(Opc) { default: llvm_unreachable("Unknown opcode!"); case X86ISD::VSHLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRLI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType)); } break; case X86ISD::VSRAI: for (unsigned i=0; i!=NumElts; ++i) { SDValue CurrentOp = SrcOp->getOperand(i); if (CurrentOp->isUndef()) { Elts.push_back(CurrentOp); continue; } ND = cast(CurrentOp); const APInt &C = ND->getAPIntValue(); Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType)); } break; } return DAG.getBuildVector(VT, dl, Elts); } return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, dl, MVT::i8)); } /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); // Catch shift-by-constant. if (ConstantSDNode *CShAmt = dyn_cast(ShAmt)) return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, CShAmt->getZExtValue(), DAG); // Change opcode to non-immediate version. Opc = getTargetVShiftUniformOpcode(Opc, true); // Need to build a vector containing shift amount. // SSE/AVX packed shifts only use the lower 64-bit of the shift count. // +====================+============+=======================================+ // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as | // +====================+============+=======================================+ // | i64 | Yes, No | Use ShAmt as lowest elt | // | i32 | Yes | zero-extend in-reg | // | (i32 zext(i16/i8)) | Yes | zero-extend in-reg | // | (i32 zext(i16/i8)) | No | byte-shift-in-reg | // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) | // +====================+============+=======================================+ if (SVT == MVT::i64) ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt); else if (ShAmt.getOpcode() == ISD::ZERO_EXTEND && ShAmt.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && (ShAmt.getOperand(0).getSimpleValueType() == MVT::i16 || ShAmt.getOperand(0).getSimpleValueType() == MVT::i8)) { ShAmt = ShAmt.getOperand(0); MVT AmtTy = ShAmt.getSimpleValueType() == MVT::i8 ? 
MVT::v16i8 : MVT::v8i16; ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), AmtTy, ShAmt); if (Subtarget.hasSSE41()) ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); else { SDValue ByteShift = DAG.getConstant( (128 - AmtTy.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, ByteShift); } } else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), MVT::v2i64, ShAmt); } else { SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } /// Return Mask with the necessary casting or extending /// for \p Mask according to \p MaskVT when lowering masking intrinsics static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl) { if (isAllOnesConstant(Mask)) return DAG.getConstant(1, dl, MaskVT); if (X86::isZeroNode(Mask)) return DAG.getConstant(0, dl, MaskVT); assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!"); if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!"); assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); // In case 32bit mode, bitcast i64 is illegal, extend/split it. SDValue Lo, Hi; Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(0, dl, MVT::i32)); Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, DAG.getConstant(1, dl, MVT::i32)); Lo = DAG.getBitcast(MVT::v32i1, Lo); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } else { MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, DAG.getBitcast(BitcastVT, Mask), DAG.getIntPtrConstant(0, dl)); } } /// Return (and \p Op, \p Mask) for compare instructions or /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the /// necessary casting or extending for \p Mask when lowering masking intrinsics static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); unsigned OpcodeSelect = ISD::VSELECT; SDLoc dl(Op); if (isAllOnesConstant(Mask)) return Op; SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). 
/// The mask is coming as MVT::i8 and it should be transformed /// to MVT::v1i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using /// "X86select" instead of "vselect". We just can't create the "vselect" node /// for a scalar instruction. static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (auto *MaskConst = dyn_cast(Mask)) if (MaskConst->getZExtValue() & 0x1) return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert(Mask.getValueType() == MVT::i8 && "Unexpect type"); SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, DAG.getBitcast(MVT::v8i1, Mask), DAG.getIntPtrConstant(0, dl)); if (Op.getOpcode() == X86ISD::FSETCCM || Op.getOpcode() == X86ISD::FSETCCM_RND || Op.getOpcode() == X86ISD::VFPCLASSS) return DAG.getNode(ISD::AND, dl, VT, Op, IMask); if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { if (!Fn->hasPersonalityFn()) report_fatal_error( "querying registration node size for function without personality"); // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See // WinEHStatePass for the full struct definition. switch (classifyEHPersonality(Fn->getPersonalityFn())) { case EHPersonality::MSVC_X86SEH: return 24; case EHPersonality::MSVC_CXX: return 16; default: break; } report_fatal_error( "can only recover FP for 32-bit MSVC EH personality functions"); } /// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize /// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP) { MachineFunction &MF = DAG.getMachineFunction(); SDLoc dl; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); // It's possible that the parent function no longer has a personality function // if the exceptional code was optimized away, in which case we just return // the incoming EBP. if (!Fn->hasPersonalityFn()) return EntryEBP; // Get an MCSymbol that will ultimately resolve to the frame offset of the EH // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after // prologue to RBP in the parent function. 
const X86Subtarget &Subtarget = static_cast(DAG.getSubtarget()); if (Subtarget.is64Bit()) return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. auto isRoundModeCurDirection = [](SDValue Rnd) { if (!isa(Rnd)) return false; unsigned Round = cast(Rnd)->getZExtValue(); return Round == X86::STATIC_ROUNDING::CUR_DIRECTION; }; SDLoc dl(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(2); if (!isRoundModeCurDirection(Rnd)) { return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), Rnd); } } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); } case INTR_TYPE_2OP: { SDValue Src2 = Op.getOperand(2); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(3); if (!isRoundModeCurDirection(Rnd)) { return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Op.getOperand(1), Src2, Rnd); } } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Src2); } case INTR_TYPE_3OP: case INTR_TYPE_3OP_IMM8: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); if (IntrData->Type == INTR_TYPE_3OP_IMM8) Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd); } } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, Src3); } case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), Op.getOperand(4)); case INTR_TYPE_1OP_MASK_RM: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; // We always add rounding mode to the Node. // If the rounding mode is not specified, we add the // "current direction" mode. 
if (Op.getNumOperands() == 4) RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); assert(IntrData->Opc1 == 0 && "Unexpected second opcode!"); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. bool HasRounding = IntrWithRoundingModeOpcode != 0; if (Op.getNumOperands() == (5U + HasRounding)) { if (HasRounding) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Rnd), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), Mask, passThru, Subtarget, DAG); } assert(Op.getNumOperands() == (6U + HasRounding) && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); if (HasRounding) { SDValue Sae = Op.getOperand(6); if (!isRoundModeCurDirection(Sae)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, passThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Sae), Mask, Src0, Subtarget, DAG); } assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form"); SDValue RoundingMode = Op.getOperand(5); SDValue Sae = Op.getOperand(6); return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } case INTR_TYPE_2OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } } // TODO: Intrinsics should have fast-math-flags to propagate. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". SDValue Rnd; if (Op.getNumOperands() == 6) Rnd = Op.getOperand(5); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(6); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src1, Src2, Src3, Rnd), Mask, PassThru, Subtarget, DAG); } } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } case VPERM_2OP : { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); // Swap Src1 and Src2 in the node creation return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); } case IFMA_OP: // NOTE: We need to swizzle the operands to pass the multiply operands // first. return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case CVTPD2PS: // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1), DAG.getIntPtrConstant(0, dl)); case CVTPD2PS_RND_MASK: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); // We add rounding mode to the Node when // - RM Opcode is specified and // - RM is not "current direction". 
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; if (IntrWithRoundingModeOpcode != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) { return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, Rnd), Mask, PassThru, Subtarget, DAG); } } assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!"); // ISD::FP_ROUND has a second argument that indicates if the truncation // does not change the value. Set it to 0 since it can change. return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, DAG.getIntPtrConstant(0, dl)), Mask, PassThru, Subtarget, DAG); } case FPCLASSS: { SDValue Src1 = Op.getOperand(1); SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), FPclassMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case CMP_MASK_CC: { MVT MaskVT = Op.getSimpleValueType(); SDValue Cmp; SDValue CC = Op.getOperand(3); CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC); // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(4); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC, Rnd); } //default rounding mode if (!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2), CC); return Cmp; } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); SDValue Mask = Op.getOperand(4); SDValue Cmp; if (IntrData->Opc1 != 0) { SDValue Rnd = Op.getOperand(5); if (!isRoundModeCurDirection(Rnd)) Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd); } //default rounding mode if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. 
SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, DAG.getConstant(0, dl, MVT::v8i1), CmpMask, DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(MVT::i8, Ins); } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS); SDValue SetCC; switch (CC) { case ISD::SETEQ: { // (ZF = 0 and PF = 0) SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); break; } case ISD::SETNE: { // (ZF = 1 or PF = 1) SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); break; } case ISD::SETGT: // (CF = 0 and ZF = 0) SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); break; case ISD::SETLT: { // The condition is opposite to GT. Swap the operands. SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG); break; } case ISD::SETGE: // CF = 0 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); break; case ISD::SETLE: // The condition is opposite to GE. Swap the operands. SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG); break; default: llvm_unreachable("Unexpected illegal condition!"); } return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case COMI_RM: { // Comparison intrinsics with Sae SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); unsigned CondVal = cast(Op.getOperand(3))->getZExtValue(); SDValue Sae = Op.getOperand(4); SDValue FCmp; if (isRoundModeCurDirection(Sae)) FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8)); else FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, DAG.getConstant(0, dl, MVT::v16i1), FCmp, DAG.getIntPtrConstant(0, dl)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), Subtarget, DAG); case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } case FIXUPIMMS: case FIXUPIMMS_MASKZ: case FIXUPIMM: case FIXUPIMM_MASKZ:{ SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Imm = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ? Src1 : getZeroVector(VT, Subtarget, DAG, dl); // We specify 2 possible modes for intrinsics, with/without rounding // modes. // First, we check if the intrinsic have rounding mode (7 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; if (Op.getNumOperands() == 7) Rnd = Op.getOperand(6); else Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ) return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3, Imm, Rnd), Mask, Passthru, Subtarget, DAG); } case ROUNDP: { assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(2), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), RoundingMode); } case ROUNDS: { assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); // Clear the upper bits of the rounding immediate so that the legacy // intrinsic can't trigger the scaling behavior of VRNDSCALE. SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, Op.getOperand(3), DAG.getConstant(0xf, dl, MVT::i32)); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), RoundingMode); } // ADC/ADCX/SBB case ADX: { SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32); SDValue Res; // If the carry in is zero, then we should just use ADD/SUB instead of // ADC/SBB. if (isNullConstant(Op.getOperand(1))) { Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2), Op.getOperand(3)); } else { SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), DAG.getConstant(-1, dl, MVT::i8)); Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), Op.getOperand(3), GenCF.getValue(1)); } SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); SDValue Results[] = { SetCC, Res }; return DAG.getMergeValues(Results, dl); } case CVTPD2PS_MASK: case CVTPD2I_MASK: case TRUNCATE_TO_REG: { SDValue Src = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); if (isAllOnesConstant(Mask)) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, Mask); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); SDValue Rnd = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); if (isAllOnesConstant(Mask)) return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src, Rnd); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, Rnd, PassThru, Mask); } default: break; } } switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. 
case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestz_256: case Intrinsic::x86_avx_ptestc_256: case Intrinsic::x86_avx_ptestnzc_256: case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: case Intrinsic::x86_avx_vtestc_pd_256: case Intrinsic::x86_avx_vtestnzc_pd_256: { unsigned TestOpc = X86ISD::PTEST; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); case Intrinsic::x86_avx512_ktestc_b: case Intrinsic::x86_avx512_ktestc_w: case Intrinsic::x86_avx512_ktestc_d: case Intrinsic::x86_avx512_ktestc_q: // CF = 1 TestOpc = X86ISD::KTEST; X86CC = X86::COND_B; break; case Intrinsic::x86_avx512_ktestz_b: case Intrinsic::x86_avx512_ktestz_w: case Intrinsic::x86_avx512_ktestz_d: case Intrinsic::x86_avx512_ktestz_q: TestOpc = X86ISD::KTEST; X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestz_ps: case Intrinsic::x86_avx_vtestz_pd: case Intrinsic::x86_avx_vtestz_ps_256: case Intrinsic::x86_avx_vtestz_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_avx_ptestz_256: // ZF = 1 X86CC = X86::COND_E; break; case Intrinsic::x86_avx_vtestc_ps: case Intrinsic::x86_avx_vtestc_pd: case Intrinsic::x86_avx_vtestc_ps_256: case Intrinsic::x86_avx_vtestc_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_avx_ptestc_256: // CF = 1 X86CC = X86::COND_B; break; case Intrinsic::x86_avx_vtestnzc_ps: case Intrinsic::x86_avx_vtestnzc_pd: case Intrinsic::x86_avx_vtestnzc_ps_256: case Intrinsic::x86_avx_vtestnzc_pd_256: TestOpc = X86ISD::TESTP; LLVM_FALLTHROUGH; case Intrinsic::x86_sse41_ptestnzc: case Intrinsic::x86_avx_ptestnzc_256: // ZF and CF = 0 X86CC = X86::COND_A; break; } SDValue LHS = Op.getOperand(1); SDValue RHS = Op.getOperand(2); SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: case Intrinsic::x86_sse42_pcmpestric128: case Intrinsic::x86_sse42_pcmpistrio128: case Intrinsic::x86_sse42_pcmpestrio128: case Intrinsic::x86_sse42_pcmpistris128: case Intrinsic::x86_sse42_pcmpestris128: case Intrinsic::x86_sse42_pcmpistriz128: case Intrinsic::x86_sse42_pcmpestriz128: { unsigned Opcode; X86::CondCode X86CC; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::x86_sse42_pcmpistria128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpestria128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_A; break; case Intrinsic::x86_sse42_pcmpistric128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpestric128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_B; break; case Intrinsic::x86_sse42_pcmpistrio128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpestrio128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_O; break; case Intrinsic::x86_sse42_pcmpistris128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpestris128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_S; break; case Intrinsic::x86_sse42_pcmpistriz128: Opcode = X86ISD::PCMPISTR; X86CC = X86::COND_E; break; case Intrinsic::x86_sse42_pcmpestriz128: Opcode = X86ISD::PCMPESTR; X86CC = X86::COND_E; break; } SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } case Intrinsic::x86_sse42_pcmpistri128: case Intrinsic::x86_sse42_pcmpestri128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistri128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } case Intrinsic::x86_sse42_pcmpistrm128: case Intrinsic::x86_sse42_pcmpestrm128: { unsigned Opcode; if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) Opcode = X86ISD::PCMPISTR; else Opcode = X86ISD::PCMPESTR; SmallVector NewOps(Op->op_begin()+1, Op->op_end()); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); auto &Context = MF.getMMI().getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); return DAG.getNode(getGlobalWrapperKind(), dl, VT, DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { // Compute the symbol for the LSDA. We know it'll get emitted later. MachineFunction &MF = DAG.getMachineFunction(); SDValue Op1 = Op.getOperand(1); auto *Fn = cast(cast(Op1)->getGlobal()); MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( GlobalValue::dropLLVMManglingEscape(Fn->getName())); // Generate a simple absolute symbol reference. This intrinsic is only // supported on 32-bit Windows, which isn't PIC. SDValue Result = DAG.getMCSymbol(LSDASym, VT); return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); } case Intrinsic::eh_recoverfp: { SDValue FnOp = Op.getOperand(1); SDValue IncomingFPOp = Op.getOperand(2); GlobalAddressSDNode *GSD = dyn_cast(FnOp); auto *Fn = dyn_cast_or_null(GSD ? GSD->getGlobal() : nullptr); if (!Fn) report_fatal_error( "llvm.eh.recoverfp must take a function as the first argument"); return recoverFramePointer(DAG, Fn, IncomingFPOp); } case Intrinsic::localaddress: { // Returns one of the stack, base, or frame pointer registers, depending on // which is used to reference local variables. 
MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned Reg; if (RegInfo->hasBasePointer(MF)) Reg = RegInfo->getBaseRegister(); else // This function handles the SP or FP case. Reg = RegInfo->getPtrSizedFrameRegister(MF); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } } } static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), VT.getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the gather intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Mask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; return DAG.getMergeValues(RetOps, dl); } static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. 
if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), Src.getSimpleValueType().getVectorNumElements()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); // We support two versions of the scatter intrinsics. One with scalar mask and // one with vXi1 mask. Convert scalar to vXi1 if necessary. if (Mask.getValueType() != MaskVT) Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Src, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); return SDValue(Res, 1); } static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); auto *C = dyn_cast(ScaleOp); // Scale must be constant. if (!C) return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); return SDValue(Res, 0); } /// Handles the lowering of builtin intrinsic that return the value /// of the extended control register. static void getExtendedControlRegister(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the XCR register to // return. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain); Chain = SDValue(N1, 0); // Reads the content of XCR and returns it in registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // Merge the two 32-bit values into a 64-bit one.. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read performance monitor /// counters (x86_rdpmc). 
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue LO, HI; // The ECX register is used to select the index of the performance counter // to read. SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2)); SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain); // Reads the content of a 64-bit performance counter and returns it in the // registers EDX:EAX. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } Chain = HI.getValue(1); if (Subtarget.is64Bit()) { // The EAX register is loaded with the low-order 32 bits. The EDX register // is loaded with the supported high-order bits of the counter. SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); Results.push_back(Chain); return; } // Use a buildpair to merge the two 32-bit values into a 64-bit one. SDValue Ops[] = { LO, HI }; SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); Results.push_back(Pair); Results.push_back(Chain); } /// Handles the lowering of builtin intrinsics that read the time stamp counter /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower /// READCYCLECOUNTER nodes. static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl &Results) { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0)); SDValue LO, HI; // The processor's time-stamp counter (a 64-bit MSR) is stored into the // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR // and the EAX register is loaded with the low-order 32 bits. if (Subtarget.is64Bit()) { LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, LO.getValue(2)); } else { LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1)); HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, LO.getValue(2)); } SDValue Chain = HI.getValue(1); SDValue TSC; if (Subtarget.is64Bit()) { // The EDX register is loaded with the high-order 32 bits of the MSR, and // the EAX register is loaded with the low-order 32 bits. TSC = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, DAG.getConstant(32, DL, MVT::i8)); TSC = DAG.getNode(ISD::OR, DL, MVT::i64, LO, TSC); } else { // Use a buildpair to merge the two 32-bit values into a 64-bit one. TSC = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, { LO, HI }); } if (Opcode == X86ISD::RDTSCP_DAG) { assert(N->getNumOperands() == 2 && "Unexpected number of operands!"); // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into // the ECX register. Add 'ecx' explicitly to the chain. 
SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2)); Results.push_back(TSC); Results.push_back(ecx); Results.push_back(ecx.getValue(1)); return; } Results.push_back(TSC); Results.push_back(Chain); } static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SmallVector Results; SDLoc DL(Op); getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); return DAG.getMergeValues(Results, DL); } static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue RegNode = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EH registrations only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(RegNode); if (!FINode) report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = Op.getOperand(0); SDValue EHGuard = Op.getOperand(2); WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); if (!EHInfo) report_fatal_error("EHGuard only live in functions using WinEH"); // Cast the operand to an alloca, and remember the frame index. auto *FINode = dyn_cast(EHGuard); if (!FINode) report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); EHInfo->EHGuardFrameIndex = FINode->getIndex(); // Return the chain operand without making any DAG nodes. return Chain; } /// Emit Truncating Store with signed or unsigned saturation. static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); SDValue Ops[] = { Chain, Val, Ptr, Undef }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } /// Emit Masked Truncating Store with signed or unsigned saturation. static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG) { SDVTList VTs = DAG.getVTList(MVT::Other); SDValue Ops[] = { Chain, Val, Ptr, Mask }; return SignedSat ? DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO) : DAG.getTargetMemSDNode(VTs, Ops, Dl, MemVT, MMO); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { switch (IntNo) { case llvm::Intrinsic::x86_seh_ehregnode: return MarkEHRegistrationNode(Op, DAG); case llvm::Intrinsic::x86_seh_ehguard: return MarkEHGuard(Op, DAG); case llvm::Intrinsic::x86_flags_read_u32: case llvm::Intrinsic::x86_flags_read_u64: case llvm::Intrinsic::x86_flags_write_u32: case llvm::Intrinsic::x86_flags_write_u64: { // We need a frame pointer because this will get lowered to a PUSH/POP // sequence. 
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during ExpandISelPseudos in EmitInstrWithCustomInserter. return SDValue(); } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: case Intrinsic::x86_umwait: case Intrinsic::x86_tpause: { SDLoc dl(Op); SDValue Chain = Op->getOperand(0); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_umwait: Opcode = X86ISD::UMWAIT; break; case Intrinsic::x86_tpause: Opcode = X86ISD::TPAUSE; break; case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: Opcode = X86ISD::LWPINS; break; } SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), Op->getOperand(3), Op->getOperand(4)); SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, Operation.getValue(1)); } } return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, dl, Op->getValueType(1)), DAG.getConstant(X86::COND_B, dl, MVT::i8), SDValue(Result.getNode(), 1) }; SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, SDValue(Result.getNode(), 2)); } case GATHER_AVX2: { SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case GATHER: { //gather(v1, mask, index, base, scale); SDValue Chain = Op.getOperand(0); SDValue Src = Op.getOperand(2); SDValue Base = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Mask = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case SCATTER: { //scatter(base, mask, index, v1, scale); SDValue Chain = Op.getOperand(0); SDValue Base = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue Index = Op.getOperand(4); SDValue Src = Op.getOperand(5); SDValue Scale = Op.getOperand(6); return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget); } case PREFETCH: { SDValue Hint = Op.getOperand(6); unsigned HintVal = cast(Hint)->getZExtValue(); assert((HintVal == 2 || HintVal == 3) && "Wrong prefetch hint in intrinsic: should be 2 or 3"); unsigned Opcode = (HintVal == 2 ? 
IntrData->Opc1 : IntrData->Opc0); SDValue Chain = Op.getOperand(0); SDValue Mask = Op.getOperand(2); SDValue Index = Op.getOperand(3); SDValue Base = Op.getOperand(4); SDValue Scale = Op.getOperand(5); return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, Subtarget); } // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). case RDTSC: { SmallVector Results; getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Read Performance Monitoring Counters. case RDPMC: { SmallVector Results; getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // Get Extended Control Register. case XGETBV: { SmallVector Results; getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results); return DAG.getMergeValues(Results, dl); } // XTEST intrinsics. case XTEST: { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Ret, SDValue(InTrans.getNode(), 1)); } case TRUNCATE_TO_MEM_VI8: case TRUNCATE_TO_MEM_VI16: case TRUNCATE_TO_MEM_VI32: { SDValue Mask = Op.getOperand(4); SDValue DataToTruncate = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); MemIntrinsicSDNode *MemIntr = dyn_cast(Op); assert(MemIntr && "Expected MemIntrinsicSDNode!"); EVT MemVT = MemIntr->getMemoryVT(); uint16_t TruncationOp = IntrData->Opc0; switch (TruncationOp) { case X86ISD::VTRUNC: { if (isAllOnesConstant(Mask)) // return just a truncate store return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand()); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); if (isAllOnesConstant(Mask)) return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, MemIntr->getMemOperand(), DAG); MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, VMask, MemVT, MemIntr->getMemOperand(), DAG); } default: llvm_unreachable("Unsupported truncstore intrinsic"); } } } } SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); } // Just load the return address. 
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo()); } SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const { DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); return getReturnAddressFrameIndex(DAG); } SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo(); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); EVT VT = Op.getValueType(); MFI.setFrameAddressIsTaken(true); if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { // Depth > 0 makes no sense on targets which use Windows unwind codes. It // is not possible to crawl up the stack without looking at the unwind codes // simultaneously. int FrameAddrIndex = FuncInfo->getFAIndex(); if (!FrameAddrIndex) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( SlotSize, /*Offset=*/0, /*IsImmutable=*/false); FuncInfo->setFAIndex(FrameAddrIndex); } return DAG.getFrameIndex(FrameAddrIndex, VT); } unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); const MachineFunction &MF = DAG.getMachineFunction(); unsigned Reg = StringSwitch(RegName) .Case("esp", X86::ESP) .Case("rsp", X86::RSP) .Case("ebp", X86::EBP) .Case("rbp", X86::RBP) .Default(0); if (Reg == X86::EBP || Reg == X86::RBP) { if (!TFI.hasFP(MF)) report_fatal_error("register " + StringRef(RegName) + " is allocatable: function has no frame pointer"); #ifndef NDEBUG else { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && "Invalid Frame Register!"); } #endif } if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } unsigned X86TargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; } unsigned X86TargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Funclet personalities don't use selectors (the runtime does the selection). 
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; } bool X86TargetLowering::needsFixedCatchObjects() const { return Subtarget.isTargetWin64(); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); SDLoc dl (Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || (FrameReg == X86::EBP && PtrVT == MVT::i32)) && "Invalid Frame Register!"); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl)); StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, DAG.getRegister(StoreAddrReg, PtrVT)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); // If the subtarget is not 64bit, we may need the global base reg // after isel expand pseudo, i.e., after CGBR pass ran. // Therefore, ask for the GlobalBaseReg now, so that the pass // inserts the code for us in case we need it. // Otherwise, we will end up in a situation where we will // reference a virtual register that is not defined! if (!Subtarget.is64Bit()) { const X86InstrInfo *TII = Subtarget.getInstrInfo(); (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); } return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0), Op.getOperand(1)); } SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0)); } static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { SDValue Root = Op.getOperand(0); SDValue Trmp = Op.getOperand(1); // trampoline SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value SDLoc dl (Op); const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (Subtarget.is64Bit()) { SDValue OutChains[6]; // Large code-model. const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix // Load the pointer to the nested function into R11. 
unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 SDValue Addr = Trmp; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(2, dl, MVT::i64)); OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2), /* Alignment = */ 2); // Load the 'nest' parameter value into R10. // R10 is specified in X86CallingConv.td OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(10, dl, MVT::i64)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 10)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(12, dl, MVT::i64)); OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12), /* Alignment = */ 2); // Jump to the nested function. OpCode = (JMP64r << 8) | REX_WB; // jmpq *... Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(20, dl, MVT::i64)); OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), Addr, MachinePointerInfo(TrmpAddr, 20)); unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, DAG.getConstant(22, dl, MVT::i64)); OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 22)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } else { const Function *Func = cast(cast(Op.getOperand(5))->getValue()); CallingConv::ID CC = Func->getCallingConv(); unsigned NestReg; switch (CC) { default: llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::X86_StdCall: { // Pass 'nest' parameter in ECX. // Must be kept in sync with X86CallingConv.td NestReg = X86::ECX; // Check that ECX wasn't needed by an 'inreg' parameter. FunctionType *FTy = Func->getFunctionType(); const AttributeList &Attrs = Func->getAttributes(); if (!Attrs.isEmpty() && !Func->isVarArg()) { unsigned InRegCount = 0; unsigned Idx = 1; for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) if (Attrs.hasAttribute(Idx, Attribute::InReg)) { auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" " parameters!"); } } break; } case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::Fast: // Pass 'nest' parameter in EAX. // Must be kept in sync with X86CallingConv.td NestReg = X86::EAX; break; } SDValue OutChains[4]; SDValue Addr, Disp; Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(10, dl, MVT::i32)); Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr)); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(1, dl, MVT::i32)); OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1), /* Alignment = */ 1); const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(5, dl, MVT::i32)); OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, MachinePointerInfo(TrmpAddr, 5), /* Alignment = */ 1); Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, DAG.getConstant(6, dl, MVT::i32)); OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6), /* Alignment = */ 1); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } } SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { /* The rounding mode is in bits 11:10 of FPSR, and has the following settings: 00 Round to nearest 01 Round to -inf 10 Round to +inf 11 Round to 0 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined 0 Round to 0 1 Round to nearest 2 Round to +inf 3 Round to -inf To perform the conversion, we do: (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */ MachineFunction &MF = DAG.getMachineFunction(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); MVT VT = Op.getSimpleValueType(); SDLoc DL(Op); // Save FP Control Word to stack slot int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); // Load FP Control Word from stack slot SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo()); // Transform as necessary SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x800, DL, MVT::i16)), DAG.getConstant(11, DL, MVT::i8)); SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, DAG.getNode(ISD::AND, DL, MVT::i16, CWD, DAG.getConstant(0x400, DL, MVT::i16)), DAG.getConstant(9, DL, MVT::i8)); SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, DAG.getNode(ISD::ADD, DL, MVT::i16, DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), DAG.getConstant(1, DL, MVT::i16)), DAG.getConstant(3, DL, MVT::i16)); return DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } // Split an unary integer op into 2 half sized ops. static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumElems = VT.getVectorNumElements(); unsigned SizeInBits = VT.getSizeInBits(); MVT EltVT = VT.getVectorElementType(); SDValue Src = Op.getOperand(0); assert(EltVT == Src.getSimpleValueType().getVectorElementType() && "Src and Op should have the same element type!"); // Extract the Lo/Hi vectors SDLoc dl(Op); SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2); SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2); MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, Lo), DAG.getNode(Op.getOpcode(), dl, NewVT, Hi)); } // Decompose 256-bit ops into smaller 128-bit ops. 
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } // Decompose 512-bit ops into smaller 256-bit ops. static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) { assert(Op.getSimpleValueType().is512BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 512-bit vector integer operation"); return LowerVectorIntUnary(Op, DAG); } /// Lower a vector CTLZ using native supported vector CTLZ instruction. // // i8/i16 vector implemented using dword LZCNT vector instruction // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, // split the vector, perform operation on it's Lo a Hi part and // concatenate the results. static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(Op.getOpcode() == ISD::CTLZ); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); unsigned NumElems = VT.getVectorNumElements(); assert((EltVT == MVT::i8 || EltVT == MVT::i16) && "Unsupported element type"); // Split vector, it's Lo and Hi parts will be handled in next iteration. if (NumElems > 16 || (NumElems == 16 && !Subtarget.canExtendTo512DQ())) return LowerVectorIntUnary(Op, DAG); MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && "Unsupported value type for operation"); // Use native supported vector instruction vplzcntd. Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); } // Lower CTLZ using a PSHUFB lookup table implementation. static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); int NumElts = VT.getVectorNumElements(); int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); // Per-nibble leading zero PSHUFB lookup table. const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; SmallVector LUTVec; for (int i = 0; i < NumBytes; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); // Begin by bitcasting the input to byte vector, then split those bytes // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them. // If the hi input nibble is zero then we add both results together, otherwise // we just take the hi result (by masking the lo result to zero before the // add). 
SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, CurrVT); SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); SDValue Lo = Op0; SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); SDValue HiZ; if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); } Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); // Merge result back from vXi8 back to VT, working on the lo/hi halves // of the current vector width in the same way we did for the nibbles. // If the upper half of the input element is zero then add the halves' // leading zero counts together, otherwise just use the upper half's. // Double the width of the result until we are at target width. while (CurrVT != VT) { int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); int CurrNumElts = CurrVT.getVectorNumElements(); MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); // Check if the upper half of the input element is zero. if (CurrVT.is512BitVector()) { MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); } else { HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); } HiZ = DAG.getBitcast(NextVT, HiZ); // Move the upper/lower halves to the lower bits as we'll be extending to // NextVT. Mask the lower result to zero if HiZ is true and add the results // together. SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); CurrVT = NextVT; } return Res; } static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasCDI() && // vXi8 vectors need to be promoted to 512-bits for vXi32. (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8)) return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); } static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); unsigned Opc = Op.getOpcode(); if (VT.isVector()) return LowerVectorCTLZ(Op, dl, Subtarget, DAG); Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. 
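    // The widened BSR still returns the bit index of the highest set bit, so
    // below ctlz(x) is formed as (NumBits - 1) ^ bsr(x); for plain CTLZ the
    // CMOV substitutes 2*NumBits - 1 on zero input so that the final XOR
    // yields NumBits.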
OpVT = MVT::i32; Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); } // Issue a bsr (scan bits in reverse) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); if (Opc == ISD::CTLZ) { // If src is zero (i.e. bsr sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); } // Finally xor with NumBits-1. Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits - 1, dl, OpVT)); if (VT == MVT::i8) Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); return Op; } static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); unsigned NumBits = VT.getScalarSizeInBits(); SDValue N0 = Op.getOperand(0); SDLoc dl(Op); // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { Op, DAG.getConstant(NumBits, dl, VT), DAG.getConstant(X86::COND_E, dl, MVT::i8), Op.getValue(1) }; return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); } /// Break a 256-bit integer operation into two new 128-bit ones and then /// concatenate the result back. static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is256BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } /// Break a 512-bit integer operation into two new 256-bit ones and then /// concatenate the result back. 
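/// For example, a signed v64i8 MULHS is lowered as two v32i8 operations on
/// the low and high halves of the operands.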
static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.is512BitVector() && VT.isInteger() && "Unsupported value type for operation"); unsigned NumElems = VT.getVectorNumElements(); SDLoc dl(Op); // Extract the LHS vectors SDValue LHS = Op.getOperand(0); SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl); SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl); // Extract the RHS vectors SDValue RHS = Op.getOperand(1); SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl); SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl); MVT EltVT = VT.getVectorElementType(); MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1), DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32) return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::XOR, SDLoc(Op), VT, Op.getOperand(0), Op.getOperand(1)); assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return split256IntArith(Op, DAG); } static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getScalarType() == MVT::i1) { SDLoc dl(Op); switch (Op.getOpcode()) { default: llvm_unreachable("Expected saturated arithmetic opcode"); case ISD::UADDSAT: case ISD::SADDSAT: return DAG.getNode(ISD::OR, dl, VT, Op.getOperand(0), Op.getOperand(1)); case ISD::USUBSAT: case ISD::SSUBSAT: return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getNOT(dl, Op.getOperand(1), VT)); } } assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return split256IntArith(Op, DAG); } static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { // Since X86 does not have CMOV for 8-bit integer, we don't convert // 8-bit integer abs to NEG and CMOV. SDLoc DL(Op); SDValue N0 = Op.getOperand(0); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, VT), N0); SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), SDValue(Neg.getNode(), 1)}; return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); } // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X). if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) { SDLoc DL(Op); SDValue Src = Op.getOperand(0); SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src); } if (VT.is256BitVector() && !Subtarget.hasInt256()) { assert(VT.isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntUnary(Op, DAG); } // Default to expand. return SDValue(); } static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // For AVX1 cases, split to use legal ops (everything but v4i64). 
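  // v4i64 is excluded because there is no 128-bit integer MIN/MAX for i64
  // either before AVX512, so it falls through to the compare+select
  // expansion below.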
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector()) return split256IntArith(Op, DAG); SDLoc DL(Op); unsigned Opcode = Op.getOpcode(); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit, // using the SMIN/SMAX instructions and flipping the signbit back. if (VT == MVT::v8i16) { assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) && "Unexpected MIN/MAX opcode"); SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT); N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign); N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign); Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX); SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1); return DAG.getNode(ISD::XOR, DL, VT, Result, Sign); } // Else, expand to a compare/select. ISD::CondCode CC; switch (Opcode) { case ISD::SMIN: CC = ISD::CondCode::SETLT; break; case ISD::SMAX: CC = ISD::CondCode::SETGT; break; case ISD::UMIN: CC = ISD::CondCode::SETULT; break; case ISD::UMAX: CC = ISD::CondCode::SETUGT; break; default: llvm_unreachable("Unknown MINMAX opcode"); } SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC); return DAG.getSelect(DL, VT, Cond, N0, N1); } static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); if (VT.getScalarType() == MVT::i1) return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1)); // Decompose 256-bit ops into 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return split256IntArith(Op, DAG); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16 // vector pairs, multiply and truncate. if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); return DAG.getNode( ISD::TRUNCATE, dl, VT, DAG.getNode(ISD::MUL, dl, ExVT, DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A), DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B))); } MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Extract the lo/hi parts to any extend to i16. // We're going to mask off the low byte of each result element of the // pmullw, so it doesn't matter what's in the high byte of each 16-bit // element. SDValue Undef = DAG.getUNDEF(VT); SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the LHS is a constant, manually unpackl/unpackh. SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, MVT::i16)); HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, MVT::i16)); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); } // Multiply, mask the lower 8bits of the lo/hi results and pack. 
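    // e.g. for v16i8: the two v8i16 multiplies give full 16-bit products,
    // AND with 0x00FF keeps only the low byte of each product, and PACKUS
    // packs the lo/hi halves back into sixteen bytes.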
SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT)); RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT)); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. if (VT == MVT::v4i32) { assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmulld is available!"); // Extract the odd parts. static const int UnpackMask[] = { 1, -1, 3, -1 }; SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); // Multiply the even parts. SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, A), DAG.getBitcast(MVT::v2i64, B)); // Now multiply odd parts. SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, DAG.getBitcast(MVT::v2i64, Aodds), DAG.getBitcast(MVT::v2i64, Bodds)); Evens = DAG.getBitcast(VT, Evens); Odds = DAG.getBitcast(VT, Odds); // Merge the two vectors back together with a shuffle. This expands into 2 // shuffles. static const int ShufMask[] = { 0, 4, 2, 6 }; return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); } assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && "Only know how to lower V2I64/V4I64/V8I64 multiply"); assert(!Subtarget.hasDQI() && "DQI should use MULLQ"); // Ahi = psrlqi(a, 32); // Bhi = psrlqi(b, 32); // // AloBlo = pmuludq(a, b); // AloBhi = pmuludq(a, Bhi); // AhiBlo = pmuludq(Ahi, b); // // Hi = psllqi(AloBhi + AhiBlo, 32); // return AloBlo + Hi; KnownBits AKnown = DAG.computeKnownBits(A); KnownBits BKnown = DAG.computeKnownBits(B); APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero); bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero); APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero); bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero); SDValue Zero = DAG.getConstant(0, dl, VT); // Only multiply lo/hi halves that aren't known to be zero. SDValue AloBlo = Zero; if (!ALoIsZero && !BLoIsZero) AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); SDValue AloBhi = Zero; if (!ALoIsZero && !BHiIsZero) { SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); } SDValue AhiBlo = Zero; if (!AHiIsZero && !BLoIsZero) { SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); } SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); } static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); bool IsSigned = Op->getOpcode() == ISD::MULHS; unsigned NumElts = VT.getVectorNumElements(); SDValue A = Op.getOperand(0); SDValue B = Op.getOperand(1); // Decompose 256-bit ops into 128-bit ops. 
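  // e.g. a v16i16 MULHU on an AVX1-only target becomes two v8i16 halves,
  // which then map onto PMULHUW.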
if (VT.is256BitVector() && !Subtarget.hasInt256()) return split256IntArith(Op, DAG); if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || (VT == MVT::v8i32 && Subtarget.hasInt256()) || (VT == MVT::v16i32 && Subtarget.hasAVX512())); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. // E.g., PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> // // In other word, to have all the results, we need to perform two PMULxD: // 1. one with the even values. // 2. one with the odd values. // To achieve #2, with need to place the odd values at an even position. // // Place the odd value at an even position (basically, shift all values 1 // step to the left): const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // => SDValue Odd0 = DAG.getVectorShuffle(VT, dl, A, A, makeArrayRef(&Mask[0], NumElts)); // => SDValue Odd1 = DAG.getVectorShuffle(VT, dl, B, B, makeArrayRef(&Mask[0], NumElts)); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); unsigned Opcode = (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ; // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, A), DAG.getBitcast(MulVT, B))); // PMULUDQ <4 x i32> , <4 x i32> // => <2 x i64> SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, DAG.getBitcast(MulVT, Odd0), DAG.getBitcast(MulVT, Odd1))); // Shuffle it back into the right order. SmallVector ShufMask(NumElts); for (int i = 0; i != (int)NumElts; ++i) ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask); // If we have a signed multiply but no PMULDQ fix up the result of an // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { SDValue Zero = DAG.getConstant(0, dl, VT); SDValue T1 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B); SDValue T2 = DAG.getNode(ISD::AND, dl, VT, DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A); SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup); } return Res; } // Only i8 vectors should need custom lowering after this. assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, // logical shift down the upper half and pack back to i8. // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack // and then ashr/lshr the upper bits down to the lower bits before multiply. unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); } // For signed 512-bit vectors, split into 256-bit vectors to allow the // sign-extension to occur. if (VT == MVT::v64i8 && IsSigned) return split512IntArith(Op, DAG); // Signed AVX2 implementation - extend xmm subvectors to ymm. 
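  // Each 128-bit half of the v32i8 inputs is sign-extended to v16i16,
  // multiplied, shifted right by 8 to keep the high byte of each product,
  // and the even bytes of both halves are shuffled back together.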
if (VT == MVT::v32i8 && IsSigned) { SDValue Lo = DAG.getIntPtrConstant(0, dl); SDValue Hi = DAG.getIntPtrConstant(NumElts / 2, dl); MVT ExVT = MVT::v16i16; SDValue ALo = extract128BitVector(A, 0, DAG, dl); SDValue BLo = extract128BitVector(B, 0, DAG, dl); SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl); SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl); ALo = DAG.getNode(ExAVX, dl, ExVT, ALo); BLo = DAG.getNode(ExAVX, dl, ExVT, BLo); AHi = DAG.getNode(ExAVX, dl, ExVT, AHi); BHi = DAG.getNode(ExAVX, dl, ExVT, BHi); Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG); // Bitcast back to VT and then pack all the even elements from Lo and Hi. // Shuffle lowering should turn this into PACKUS+PERMQ Lo = DAG.getBitcast(VT, Lo); Hi = DAG.getBitcast(VT, Hi); return DAG.getVectorShuffle(VT, dl, Lo, Hi, { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62}); } // For signed v16i8 and all unsigned vXi8 we will unpack the low and high // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies, // shift the results and pack the half lane results back together. MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); static const int PSHUFDMask[] = { 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}; // Extract the lo parts and zero/sign extend to i16. // Only use SSE4.1 instructions for signed v16i8 where using unpack requires // shifts to sign extend. Using unpack for unsigned only requires an xor to // create zeros and a copy due to tied registers contraints pre-avx. But using // zero_extend_vector_inreg would require an additional pshufd for the high // part. SDValue ALo, AHi; if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { ALo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, A); AHi = DAG.getVectorShuffle(VT, dl, A, A, PSHUFDMask); AHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, AHi); } else if (IsSigned) { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), A)); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), A)); ALo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, ALo, 8, DAG); AHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, AHi, 8, DAG); } else { ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, DAG.getConstant(0, dl, VT))); AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, DAG.getConstant(0, dl, VT))); } SDValue BLo, BHi; if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { // If the LHS is a constant, manually unpackl/unpackh and extend. 
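    // Constant operand bytes are widened straight into i16 build vectors
    // here, avoiding the run-time unpack/extend sequence used for
    // non-constant inputs below.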
SmallVector LoOps, HiOps; for (unsigned i = 0; i != NumElts; i += 16) { for (unsigned j = 0; j != 8; ++j) { SDValue LoOp = B.getOperand(i + j); SDValue HiOp = B.getOperand(i + j + 8); if (IsSigned) { LoOp = DAG.getSExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getSExtOrTrunc(HiOp, dl, MVT::i16); } else { LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); } LoOps.push_back(LoOp); HiOps.push_back(HiOp); } } BLo = DAG.getBuildVector(ExVT, dl, LoOps); BHi = DAG.getBuildVector(ExVT, dl, HiOps); } else if (IsSigned && VT == MVT::v16i8 && Subtarget.hasSSE41()) { BLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, B); BHi = DAG.getVectorShuffle(VT, dl, B, B, PSHUFDMask); BHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, ExVT, BHi); } else if (IsSigned) { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), B)); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), B)); BLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BLo, 8, DAG); BHi = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, BHi, 8, DAG); } else { BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, DAG.getConstant(0, dl, VT))); BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, DAG.getConstant(0, dl, VT))); } // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and // pack back to vXi8. SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, RHi, 8, DAG); // Bitcast back to VT and then pack all the even elements from Lo and Hi. return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget.isTargetWin64() && "Unexpected target"); EVT VT = Op.getValueType(); assert(VT.isInteger() && VT.getSizeInBits() == 128 && "Unexpected return type for lowering"); RTLIB::Libcall LC; bool isSigned; switch (Op->getOpcode()) { default: llvm_unreachable("Unexpected request for libcall!"); case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break; case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break; } SDLoc dl(Op); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { EVT ArgVT = Op->getOperand(i).getValueType(); assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && "Unexpected argument type for lowering"); SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); Entry.Node = StackPtr; InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(), /* Alignment = */ 16); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); Entry.Ty = PointerType::get(ArgTy,0); Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); } SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setLibCallee( getLibcallCallingConv(LC), static_cast(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, std::move(Args)) .setInRegister() 
.setSExtResult(isSigned) .setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return DAG.getBitcast(VT, CallInfo.first); } // Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (VT.getScalarSizeInBits() < 16) return false; if (VT.is512BitVector() && Subtarget.hasAVX512() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()); bool AShift = LShift && (Subtarget.hasAVX512() || (VT != MVT::v2i64 && VT != MVT::v4i64)); return (Opcode == ISD::SRA) ? AShift : LShift; } // The shift amount is a variable, but it is the same for all vector lanes. // These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } // Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; // vXi16 supported only on AVX-512, BWI if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; if (Subtarget.hasAVX512()) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; return (Opcode == ISD::SRA) ? AShift : LShift; } static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false); auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); SDValue Ex = DAG.getBitcast(ExVT, R); // ashr(R, 63) === cmp_slt(R, 0) if (ShiftAmt == 63 && Subtarget.hasSSE42()) { assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && "Unsupported PCMPGT op"); return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R); } if (ShiftAmt >= 32) { // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt - 32, DAG); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {9, 1, 11, 3, 13, 5, 15, 7}); } else { // SRA upper i32, SRL whole i64 and select lower i32. SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, ShiftAmt, DAG); SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); Lower = DAG.getBitcast(ExVT, Lower); if (VT == MVT::v2i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); if (VT == MVT::v4i64) Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {8, 1, 10, 3, 12, 5, 14, 7}); } return DAG.getBitcast(VT, Ex); }; // Optimize shl/srl/sra with constant shift amount. 
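  // Everything below assumes a single uniform immediate, so bail out unless
  // the shift amount is a constant splat.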
APInt APIntShiftAmt; if (!isConstantSplat(Amt, APIntShiftAmt)) return SDValue(); uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || (Subtarget.hasInt256() && VT == MVT::v4i64)) && Op.getOpcode() == ISD::SRA) return ArithmeticShiftRight64(ShiftAmt); if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); // Simple i8 add case if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) return DAG.getNode(ISD::ADD, dl, VT, R, R); // ashr(R, 7) === cmp_slt(R, 0) if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { SDValue Zeros = DAG.getConstant(0, dl, VT); if (VT.is512BitVector()) { assert(VT == MVT::v64i8 && "Unexpected element type!"); SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT); return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); } return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); } // XOP can shift v16i8 directly instead of as shift v8i16 + mask. if (VT == MVT::v16i8 && Subtarget.hasXOP()) return SDValue(); if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; } llvm_unreachable("Unknown shift opcode."); } return SDValue(); } // If V is a splat value, return the source vector and splat index; static SDValue IsSplatVector(SDValue V, int &SplatIdx, SelectionDAG &DAG) { V = peekThroughEXTRACT_SUBVECTORs(V); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); switch (Opcode) { default: { APInt UndefElts; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (DAG.isSplatValue(V, DemandedElts, UndefElts)) { // Handle case where all demanded elements are UNDEF. if (DemandedElts.isSubsetOf(UndefElts)) { SplatIdx = 0; return DAG.getUNDEF(VT); } SplatIdx = (UndefElts & DemandedElts).countTrailingOnes(); return V; } break; } case ISD::VECTOR_SHUFFLE: { // Check if this is a shuffle node doing a splat. // TODO - remove this and rely purely on SelectionDAG::isSplatValue, // getTargetVShiftNode currently struggles without the splat source. 
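    // A splat shuffle reads one element of one input; map the combined index
    // back to (source operand, element within that operand).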
auto *SVN = cast(V); if (!SVN->isSplat()) break; int Idx = SVN->getSplatIndex(); int NumElts = V.getValueType().getVectorNumElements(); SplatIdx = Idx % NumElts; return V.getOperand(Idx / NumElts); } } return SDValue(); } static SDValue GetSplatValue(SDValue V, const SDLoc &dl, SelectionDAG &DAG) { int SplatIdx; if (SDValue SrcVector = IsSplatVector(V, SplatIdx, DAG)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVector.getValueType().getScalarType(), SrcVector, DAG.getIntPtrConstant(SplatIdx, dl)); return SDValue(); } static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opcode, true); if (SDValue BaseShAmt = GetSplatValue(Amt, dl, DAG)) { if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) { MVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!"); if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt); else if (EltVT.bitsLT(MVT::i32)) BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG); } // vXi8 shifts - shift as v8i16 + mask result. if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) || VT == MVT::v64i8) && !Subtarget.hasXOP()) { unsigned NumElts = VT.getVectorNumElements(); MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); if (SupportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt); // Create the mask using vXi16 shifts. For shift-rights we need to move // the upper byte down before splatting the vXi8 mask. SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, BaseShAmt, Subtarget, DAG); if (Opcode != ISD::SHL) BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, 8, DAG); BitMask = DAG.getBitcast(VT, BitMask); BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask, SmallVector(NumElts, 0)); SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, DAG.getBitcast(ExtVT, R), BaseShAmt, Subtarget, DAG); Res = DAG.getBitcast(VT, Res); Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); if (Opcode == ISD::SRA) { // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); SignMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, Subtarget, DAG); SignMask = DAG.getBitcast(VT, SignMask); Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); } return Res; } } } // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. 
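  // The bitcast BUILD_VECTOR is only accepted if every i64 lane is assembled
  // from the same (lo, hi) pieces, i.e. the amount is still uniform, in which
  // case the low 64 bits of the operand can serve as the single shift amount.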
if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = 64 / Amt.getScalarValueSizeInBits(); std::vector Vals(Ratio); for (unsigned i = 0; i != Ratio; ++i) Vals[i] = Amt.getOperand(i); for (unsigned i = Ratio, e = Amt.getNumOperands(); i != e; i += Ratio) { for (unsigned j = 0; j != Ratio; ++j) if (Vals[j] != Amt.getOperand(i + j)) return SDValue(); } if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1)); } return SDValue(); } // Convert a shift/rotate left amount to a multiplication scale factor. static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Amt.getSimpleValueType(); if (!(VT == MVT::v8i16 || VT == MVT::v4i32 || (Subtarget.hasInt256() && VT == MVT::v16i16) || (!Subtarget.hasAVX512() && VT == MVT::v16i8))) return SDValue(); if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector Elts; MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = Amt->getOperand(i); if (Op->isUndef()) { Elts.push_back(Op); continue; } ConstantSDNode *ND = cast(Op); APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); } return DAG.getBuildVector(VT, dl, Elts); } // If the target doesn't support variable shifts, use either FP conversion // or integer multiplication to avoid shifting each element individually. if (VT == MVT::v4i32) { Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, DAG.getConstant(0x3f800000U, dl, VT)); Amt = DAG.getBitcast(MVT::v4f32, Amt); return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt); } // AVX2 can more effectively perform this as a zext/trunc to/from v8i32. if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) { SDValue Z = DAG.getConstant(0, dl, VT); SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z)); SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z)); Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG); Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG); if (Subtarget.hasSSE41()) return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo), DAG.getBitcast(VT, Hi), {0, 2, 4, 6, 8, 10, 12, 14}); } return SDValue(); } static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); unsigned Opc = Op.getOpcode(); unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true); unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false); assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) return V; if (SupportedVectorVarShift(VT, Subtarget, Opc)) return Op; // XOP has 128-bit variable logical/arithmetic shifts. 
// +ve/-ve Amt = shift left/right. if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) { if (Opc == ISD::SRL || Opc == ISD::SRA) { SDValue Zero = DAG.getConstant(0, dl, VT); Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); } if (Opc == ISD::SHL || Opc == ISD::SRL) return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); if (Opc == ISD::SRA) return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); } // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Opc != ISD::SRA) { // Splat the shift amounts so the scalar shifts above will catch it. SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } // i64 vector arithmetic shift can be emulated with the transform: // M = lshr(SIGN_MASK, Amt) // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) && Opc == ISD::SRA) { SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); R = DAG.getNode(ISD::XOR, dl, VT, R, M); R = DAG.getNode(ISD::SUB, dl, VT, R, M); return R; } // If possible, lower this shift as a sequence of two shifts by // constant plus a BLENDing shuffle instead of scalarizing it. // Example: // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) // // Could be rewritten as: // (v4i32 (MOVSS (srl A, ), (srl A, ))) // // The advantage is that the two shifts from the example would be // lowered as X86ISD::VSRLI nodes in parallel before blending. if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) { SDValue Amt1, Amt2; unsigned NumElts = VT.getVectorNumElements(); SmallVector ShuffleMask; for (unsigned i = 0; i != NumElts; ++i) { SDValue A = Amt->getOperand(i); if (A.isUndef()) { ShuffleMask.push_back(SM_SentinelUndef); continue; } if (!Amt1 || Amt1 == A) { ShuffleMask.push_back(i); Amt1 = A; continue; } if (!Amt2 || Amt2 == A) { ShuffleMask.push_back(i + NumElts); Amt2 = A; continue; } break; } // Only perform this blend if we can perform it without loading a mask. if (ShuffleMask.size() == NumElts && Amt1 && Amt2 && (VT != MVT::v16i16 || is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) && (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL || canWidenShuffleElements(ShuffleMask))) { auto *Cst1 = dyn_cast(Amt1); auto *Cst2 = dyn_cast(Amt2); if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) && Cst2->getAPIntValue().ult(EltSizeInBits)) { SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, Cst1->getZExtValue(), DAG); SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, Cst2->getZExtValue(), DAG); return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask); } } } // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. if (Opc == ISD::SHL) if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG)) return DAG.getNode(ISD::MUL, dl, VT, R, Scale); // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt). 
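  // e.g. srl x, 3 on i16 becomes mulhu(x, 1 << (16 - 3)) == x >> 3; a shift
  // amount of zero is handled separately by the select below, since 1 << 16
  // does not fit in an i16 scale factor.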
if (Opc == ISD::SRL && ConstantAmt && (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) { SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { SDValue Zero = DAG.getConstant(0, dl, VT); SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ); SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale); return DAG.getSelect(dl, VT, ZAmt, R, Res); } } // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt). // TODO: Special case handling for shift by 0/1, really we can afford either // of these cases in pre-SSE41/XOP/AVX512 but not both. if (Opc == ISD::SRA && ConstantAmt && (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) && ((Subtarget.hasSSE41() && !Subtarget.hasXOP() && !Subtarget.hasAVX512()) || DAG.isKnownNeverZero(Amt))) { SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { SDValue Amt0 = DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ); SDValue Amt1 = DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ); SDValue Sra1 = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG); SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale); Res = DAG.getSelect(dl, VT, Amt0, R, Res); return DAG.getSelect(dl, VT, Amt1, Sra1, Res); } } // v4i32 Non Uniform Shifts. // If the shift amount is constant we can shift each lane using the SSE2 // immediate shifts, else we need to zero-extend each lane to the lower i64 // and shift using the SSE2 variable shifts. // The separate results can then be blended together. if (VT == MVT::v4i32) { SDValue Amt0, Amt1, Amt2, Amt3; if (ConstantAmt) { Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); } else { // The SSE2 shifts use the lower i64 as the same shift amount for // all lanes and the upper i64 is ignored. On AVX we're better off // just zero-extending, but for SSE just duplicating the top 16-bits is // cheaper and has the same effect for out of range values. if (Subtarget.hasAVX()) { SDValue Z = DAG.getConstant(0, dl, VT); Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); } else { SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt); SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {4, 5, 6, 7, -1, -1, -1, -1}); Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {0, 1, 1, 1, -1, -1, -1, -1}); Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, {2, 3, 3, 3, -1, -1, -1, -1}); Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, {0, 1, 1, 1, -1, -1, -1, -1}); Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23, {2, 3, 3, 3, -1, -1, -1, -1}); } } unsigned ShOpc = ConstantAmt ? 
Opc : X86OpcV; SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0)); SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1)); SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2)); SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3)); // Merge the shifted lane results optimally with/without PBLENDW. // TODO - ideally shuffle combining would handle this. if (Subtarget.hasSSE41()) { SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5}); SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7}); return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7}); } // It's worth extending once and using the vXi16/vXi32 shifts for smaller // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 // make the existing SSE solution better. // NOTE: We honor prefered vector width before promoting to 512-bits. if ((Subtarget.hasInt256() && VT == MVT::v8i16) || (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) || (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) || (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) || (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) { assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && "Unexpected vector type"); MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; R = DAG.getNode(ExtOpc, dl, ExtVT, R); Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getNode(Opc, dl, ExtVT, R, Amt)); } // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && (VT == MVT::v16i8 || VT == MVT::v64i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); SDValue Cst8 = DAG.getConstant(8, dl, MVT::i8); // Extend constant shift amount to vXi16 (it doesn't matter if the type // isn't legal). MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT); Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt); Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt); assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) && "Constant build vector expected"); if (VT == MVT::v16i8 && Subtarget.hasInt256()) { R = Opc == ISD::SRA ? 
DAG.getSExtOrTrunc(R, dl, ExVT) : DAG.getZExtOrTrunc(R, dl, ExVT); R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt); R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8); return DAG.getZExtOrTrunc(R, dl, VT); } SmallVector LoAmt, HiAmt; for (int i = 0; i != NumElts; i += 16) { for (int j = 0; j != 8; ++j) { LoAmt.push_back(Amt.getOperand(i + j)); HiAmt.push_back(Amt.getOperand(i + j + 8)); } } MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2); SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt); SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt); SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R)); SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R)); LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8); HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8); LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA); HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA); LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8); HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8); return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (VT.is512BitVector()) { // On AVX512BW targets we make use of the fact that VSELECT lowers // to a masked blend which selects bytes based just on the sign bit // extracted to a mask. MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel, ISD::SETGT); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = DAG.getConstant(0, dl, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG); Amt = DAG.getBitcast(VT, Amt); if (Opc == ISD::SHL || Opc == ISD::SRL) { // r = VSELECT(r, shift(r, 4), a); SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT)); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT)); R = SignBitSelect(VT, Amt, M, R); return R; } if (Opc == ISD::SRA) { // For SRA we need to unpack each byte to the higher byte of a i16 vector // so we can correctly sign extend. We don't care what happens to the // lower byte. 
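      // With the byte in the high half of each i16 lane, the i16 arithmetic
      // shifts below replicate the correct sign bit; the garbage low byte is
      // discarded by the final VSRLI-by-8 + PACKUS.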
SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt); SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt); SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R); SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); // r = VSELECT(r, shift(r, 4), a); SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG); SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 2), a); MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG); MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // a += a ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); // r = VSELECT(r, shift(r, 1), a); MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG); MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG); RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); // Logical shift the result back to the lower byte, leaving a zero upper // byte meaning that we can safely pack with PACKUSWB. RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG); RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG); return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); } } if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = DAG.getConstant(0, dl, VT); SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z); SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z); SDValue RLo = getUnpackl(DAG, dl, VT, Z, R); SDValue RHi = getUnpackh(DAG, dl, VT, Z, R); ALo = DAG.getBitcast(ExtVT, ALo); AHi = DAG.getBitcast(ExtVT, AHi); RLo = DAG.getBitcast(ExtVT, RLo); RHi = DAG.getBitcast(ExtVT, RHi); SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo); SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi); Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG); Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } if (VT == MVT::v8i16) { // If we have a constant shift amount, the non-SSE41 path is best as // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW. bool UseSSE41 = Subtarget.hasSSE41() && !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. if (UseSSE41) { MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. 
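      // An arithmetic shift right by 15 broadcasts bit 15 of each i16 lane
      // across the whole lane, producing the all-ones/all-zeros mask that the
      // select expects.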
SDValue C = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG); return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; if (UseSSE41) { // On SSE41 targets we need to replicate the shift mask in both // bytes for PBLENDVB. Amt = DAG.getNode( ISD::OR, dl, VT, getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG), getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG)); } else { Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG); } // r = VSELECT(r, shift(r, 8), a); SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 4), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // r = VSELECT(r, shift(r, 2), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG); R = SignBitSelect(Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); // return VSELECT(r, shift(r, 1), a); M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG); R = SignBitSelect(Amt, M, R); return R; } // Decompose 256-bit shifts into 128-bit shifts. if (VT.is256BitVector()) return split256IntArith(Op, DAG); return SDValue(); } static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert(VT.isVector() && "Custom lowering only for vector rotates!"); SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); unsigned Opcode = Op.getOpcode(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); int NumElts = VT.getVectorNumElements(); // Check for constant splat rotation amount. APInt UndefElts; SmallVector EltBits; int CstSplatIndex = -1; if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) for (int i = 0; i != NumElts; ++i) if (!UndefElts[i]) { if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) { CstSplatIndex = i; continue; } CstSplatIndex = -1; break; } // AVX512 implicitly uses modulo rotation amounts. if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. if (0 <= CstSplatIndex) { unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(Op, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Else, fall-back on VPROLV/VPRORV. return Op; } assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { if (VT.is256BitVector()) return split256IntArith(Op, DAG); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. if (0 <= CstSplatIndex) { uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits); return DAG.getNode(X86ISD::VROTLI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } // Use general rotate by variable (per-element). return Op; } // Split 256-bit integers on pre-AVX2 targets. 
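  // Without AVX2 there are no 256-bit integer shift instructions, so the
  // rotate is carried out on the two 128-bit halves.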
if (VT.is256BitVector() && !Subtarget.hasAVX2()) return split256IntArith(Op, DAG); assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && Subtarget.hasAVX2())) && "Only vXi32/vXi16/vXi8 vector rotates supported"); // Rotate by an uniform constant - expand back to shifts. if (0 <= CstSplatIndex) return SDValue(); bool IsSplatAmt = DAG.isSplatValue(Amt); // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by // the amount bit. if (EltSizeInBits == 8 && !IsSplatAmt) { if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) return SDValue(); // We don't need ModuloAmt here as we just peek at individual bits. MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = DAG.getConstant(0, DL, SelVT); SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel); return DAG.getSelect(DL, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. Amt = DAG.getBitcast(ExtVT, Amt); Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT)); Amt = DAG.getBitcast(VT, Amt); // r = VSELECT(r, rot(r, 4), a); SDValue M; M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // r = VSELECT(r, rot(r, 2), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT))); R = SignBitSelect(VT, Amt, M, R); // a += a Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); // return VSELECT(r, rot(r, 1), a); M = DAG.getNode( ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)), DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT))); return SignBitSelect(VT, Amt, M, R); } // ISD::ROT* uses modulo rotate amounts. Amt = DAG.getNode(ISD::AND, DL, VT, Amt, DAG.getConstant(EltSizeInBits - 1, DL, VT)); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) && SupportedVectorVarShift(VT, Subtarget, ISD::SRL); // Fallback for splats + all supported variable shifts. // Fallback for non-constants AVX2 vXi16 as well. if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); } // As with shifts, convert the rotation amount to a multiplication factor. 
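//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above):
// scalar models of the two rotate strategies emitted here. Function names are
// hypothetical.
//===----------------------------------------------------------------------===//
#include <cstdint>

// 1) Generic fallback: mask the amount (ISD::ROT* uses modulo rotate amounts)
//    and merge a left shift with the complementary right shift, which is what
//    the SHL/SRL/OR sequence computes per element.
static inline uint32_t rotl32(uint32_t X, unsigned R) {
  R &= 31;                                    // modulo rotate amount
  return (X << R) | (X >> ((32 - R) & 31));   // "& 31" keeps R == 0 well defined
                                              // in scalar C++; the vector form
                                              // relies on shift-by-width -> 0
}

// 2) vXi16 path: convert the amount to a scale (1 << r) and OR the low and
//    high halves of a widening multiply (ISD::MUL plus ISD::MULHU).
static inline uint16_t rotl16ByMul(uint16_t X, unsigned R) {
  R &= 15;
  uint32_t Prod = uint32_t(X) * (uint32_t(1) << R);
  return uint16_t(Prod) | uint16_t(Prod >> 16); // low half = x<<r, high half =
                                                // the wrapped-around bits
}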
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); assert(Scale && "Failed to convert ROTL amount to scale"); // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results. if (EltSizeInBits == 16) { SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale); SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits // that can then be OR'd with the lower 32-bits. assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected"); static const int OddMask[] = {1, -1, 3, -1}; SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask); SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask); SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R), DAG.getBitcast(MVT::v2i64, Scale)); SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, R13), DAG.getBitcast(MVT::v2i64, Scale13)); Res02 = DAG.getBitcast(VT, Res02); Res13 = DAG.getBitcast(VT, Res13); return DAG.getNode(ISD::OR, DL, VT, DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}), DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7})); } /// Returns true if the operand type is exactly twice the native width, and /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b else if (OpWidth == 128) return Subtarget.hasCmpxchg16b(); else return false; } bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { return needsCmpXchgNb(SI->getValueOperand()->getType()); } // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast(LI->getPointerOperandType()); return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); switch (Op) { default: llvm_unreachable("Unknown atomic operation"); case AtomicRMWInst::Xchg: case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. return !AI->use_empty() ? 
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. return AtomicExpansionKind::CmpXChg; } } LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. if (MemType->getPrimitiveSizeInBits() > NativeWidth) return nullptr; auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); auto Ptr = AI->getPointerOperand(); // Before the load we need a fence. Here is an example lifted from // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence // is required: // Thread 0: // x.store(1, relaxed); // r1 = y.fetch_add(0, release); // Thread 1: // y.fetch_add(42, acquire); // r2 = x.load(relaxed); // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; if (!Subtarget.hasMFence()) // FIXME: it might make sense to use a locked operation here but on a // different cache-line to prevent cache-line bouncing. In practice it // is probably a small win, and x86 processors without mfence are rare // enough that we do not bother. return nullptr; Function *MFence = llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; } static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast( cast(Op.getOperand(1))->getZExtValue()); SyncScope::ID FenceSSID = static_cast( cast(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. 
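//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above): what
// AtomicExpansionKind::CmpXChg amounts to in practice. Operations x86 cannot
// express as a single LOCK-prefixed instruction (nand, min, max, ...) are
// rewritten into a compare-exchange retry loop. Plain C++ model using
// std::atomic; this is not the expansion pass itself.
//===----------------------------------------------------------------------===//
#include <atomic>
#include <cstdint>

static uint32_t atomicNand(std::atomic<uint32_t> &A, uint32_t Operand) {
  uint32_t Old = A.load(std::memory_order_relaxed);
  // Retry until no other thread changed the value between the load and the CAS.
  while (!A.compare_exchange_weak(Old, ~(Old & Operand),
                                  std::memory_order_seq_cst,
                                  std::memory_order_relaxed)) {
    // On failure, Old holds the freshly observed value; recompute and retry.
  }
  return Old; // atomicrmw yields the previous value
}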
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); SDValue Chain = Op.getOperand(0); SDValue Zero = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Ops[] = { DAG.getRegister(X86::ESP, MVT::i32), // Base DAG.getTargetConstant(1, dl, MVT::i8), // Scale DAG.getRegister(0, MVT::i32), // Index DAG.getTargetConstant(0, dl, MVT::i32), // Disp DAG.getRegister(0, MVT::i32), // Segment. Zero, Chain }; SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, dl, MVT::Other, Ops); return SDValue(Res, 0); } // MEMBARRIER is a compiler barrier; it codegens to a no-op. return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); } static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT T = Op.getSimpleValueType(); SDLoc DL(Op); unsigned Reg = 0; unsigned size = 0; switch(T.SimpleTy) { default: llvm_unreachable("Invalid value type!"); case MVT::i8: Reg = X86::AL; size = 1; break; case MVT::i16: Reg = X86::AX; size = 2; break; case MVT::i32: Reg = X86::EAX; size = 4; break; case MVT::i64: assert(Subtarget.is64Bit() && "Node not type legal!"); Reg = X86::RAX; size = 8; break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), DAG.getTargetConstant(size, DL, MVT::i8), cpIn.getValue(1) }; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, MVT::i32, cpOut.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); return SDValue(); } // Create MOVMSKB, taking into account whether we need to split for AVX1. static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT InVT = V.getSimpleValueType(); if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, DAG.getConstant(16, DL, MVT::i8)); return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); } return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each // half to v32i1 and concatenating the result. 
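//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above): a
// scalar model of what MOVMSKB computes, i.e. one bit per byte lane taken from
// that lane's sign bit, packed into a scalar. The AVX1 split path above does
// this on each 128-bit half and ORs the halves after shifting the high mask
// left by 16. Function name is hypothetical.
//===----------------------------------------------------------------------===//
#include <cstddef>
#include <cstdint>

static uint32_t movmskb(const uint8_t *Bytes, size_t NumBytes /* <= 32 */) {
  uint32_t Mask = 0;
  for (size_t i = 0; i != NumBytes; ++i)
    Mask |= uint32_t(Bytes[i] >> 7) << i; // sign bit of lane i -> result bit i
  return Mask;
}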
if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) { assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); assert(Subtarget.hasBWI() && "Expected BWI target"); SDLoc dl(Op); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, DAG.getIntPtrConstant(0, dl)); Lo = DAG.getBitcast(MVT::v32i1, Lo); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src, DAG.getIntPtrConstant(1, dl)); Hi = DAG.getBitcast(MVT::v32i1, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); } // Custom splitting for BWI types when AVX512F is available but BWI isn't. if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) { SDLoc dl(Op); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(), DstVT.getVectorNumElements() / 2); Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); } // Use MOVMSK for vector to scalar conversion to prevent scalarization. if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8; SDLoc DL(Op); SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT); V = getPMOVMSKB(DL, V, DAG, Subtarget); return DAG.getZExtOrTrunc(V, DL, DstVT); } if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || SrcVT == MVT::i64) { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); if (DstVT != MVT::f64 && DstVT != MVT::i64 && !(DstVT == MVT::x86mmx && SrcVT.isVector())) // This conversion needs to be expanded. return SDValue(); SDLoc dl(Op); if (SrcVT.isVector()) { // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), SrcVT.getVectorNumElements() * 2); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, DAG.getUNDEF(SrcVT)); } else { assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && "Unexpected source type in LowerBITCAST"); Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); } MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); if (DstVT == MVT::x86mmx) return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, DAG.getIntPtrConstant(0, dl)); } assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() && Subtarget.hasMMX() && "Unexpected custom BITCAST"); assert((DstVT == MVT::i64 || (DstVT.isVector() && DstVT.getSizeInBits()==64)) && "Unexpected custom BITCAST"); // i64 <=> MMX conversions are Legal. if (SrcVT==MVT::i64 && DstVT.isVector()) return Op; if (DstVT==MVT::i64 && SrcVT.isVector()) return Op; // MMX <=> MMX conversions are Legal. if (SrcVT.isVector() && DstVT.isVector()) return Op; // All other conversions need to be expanded. return SDValue(); } /// Compute the horizontal sum of bytes in V for the elements of VT. /// /// Requires V to be a byte vector and VT to be an integer vector type with /// wider elements than V's type. The width of the elements of VT determines /// how many bytes of V are summed horizontally to produce each element of the /// result. 
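//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code below): a
// scalar model of the i64 case of the horizontal byte sum. PSADBW against a
// zero vector sums each group of 8 bytes into one 64-bit result, which is
// exactly the reduction the byte-wise popcount needs. Names are hypothetical.
//===----------------------------------------------------------------------===//
#include <cstddef>
#include <cstdint>

static void horizontalByteSum64(const uint8_t *Bytes, size_t NumBytes,
                                uint64_t *Out /* NumBytes / 8 results */) {
  for (size_t e = 0; e != NumBytes / 8; ++e) {
    uint64_t Sum = 0;
    for (size_t b = 0; b != 8; ++b)
      Sum += Bytes[e * 8 + b];   // psadbw(v, 0) per 64-bit chunk
    Out[e] = Sum;
  }
}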
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDLoc DL(V); MVT ByteVecVT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); assert(ByteVecVT.getVectorElementType() == MVT::i8 && "Expected value to have byte element type."); assert(EltVT != MVT::i8 && "Horizontal byte sum only makes sense for wider elements!"); unsigned VecSize = VT.getSizeInBits(); assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); // PSADBW instruction horizontally add all bytes and leave the result in i64 // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } if (EltVT == MVT::i32) { // We unpack the low half and high half into i32s interleaved with zeros so // that we can use PSADBW to horizontally sum them. The most useful part of // this is that it lines up the results of two PSADBW instructions to be // two v2i64 vectors which concatenated are the 4 population counts. We can // then use PACKUSWB to shrink and concatenate them into a v4i32 again. SDValue Zeros = DAG.getConstant(0, DL, VT); SDValue V32 = DAG.getBitcast(VT, V); SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros); SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros); // Do the horizontal sums into two v2i64s. Zeros = DAG.getConstant(0, DL, ByteVecVT); MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, DAG.getBitcast(ShortVecVT, Low), DAG.getBitcast(ShortVecVT, High)); return DAG.getBitcast(VT, V); } // The only element type left is i16. assert(EltVT == MVT::i16 && "Unknown how to handle type"); // To obtain pop count for each i16 element starting from the pop count for // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s // right by 8. It is important to shift as i16s as i8 vector shift isn't // directly supported. SDValue ShifterV = DAG.getConstant(8, DL, VT); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), DAG.getBitcast(ByteVecVT, V)); return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); } static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); int NumElts = VT.getVectorNumElements(); (void)EltVT; assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported."); // Implement a lookup table in register by using an algorithm based on: // http://wm.ite.pl/articles/sse-popcount.html // // The general idea is that every lower byte nibble in the input vector is an // index into a in-register pre-computed pop count table. We then split up the // input vector in two new ones: (1) a vector with only the shifted-right // higher nibbles for each byte and (2) a vector with the lower nibbles (and // masked out higher ones) for each byte. PSHUFB is used separately with both // to index the in-register table. 
Next, both are added and the result is a // i8 vector where each element contains the pop count for input byte. const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; SmallVector LUTVec; for (int i = 0; i < NumElts; ++i) LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec); SDValue M0F = DAG.getConstant(0x0F, DL, VT); // High nibbles SDValue FourV = DAG.getConstant(4, DL, VT); SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV); // Low nibbles SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F); // The input vector is used as the shuffle mask that index elements into the // LUT. After counting low and high nibbles, add the vector to obtain the // final pop count per i8 element. SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles); SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles); return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt); } // Please ensure that any codegen change from LowerVectorCTPOP is reflected in // updated cost models in X86TTIImpl::getIntrinsicInstrCost. static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && "Unknown CTPOP type to handle"); SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. if (Subtarget.hasVPOPCNTDQ()) { unsigned NumElems = VT.getVectorNumElements(); assert((VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16) && "Unexpected type"); if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) { MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op); return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } } // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Decompose 512-bit ops into smaller 256-bit ops. if (VT.is512BitVector() && !Subtarget.hasBWI()) return Lower512IntUnary(Op, DAG); // For element types greater than i8, do vXi8 pop counts and a bytesum. if (VT.getScalarType() != MVT::i8) { MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); SDValue ByteOp = DAG.getBitcast(ByteVT, Op0); SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp); return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG); } // We can't use the fast LUT approach, so fall back on LegalizeDAG. if (!Subtarget.hasSSSE3()) return SDValue(); return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); } static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); SDLoc DL(Op); // For scalars, its still beneficial to transfer to/from the SIMD unit to // perform the BITREVERSE. 
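//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above): a
// scalar model of the in-register LUT popcount. Each byte's population count
// is the table entry for its low nibble plus the entry for its high nibble;
// the vector version performs both lookups in parallel with PSHUFB.
//===----------------------------------------------------------------------===//
#include <cstdint>

static inline uint8_t popcnt8Lut(uint8_t B) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  // PSHUFB(LUT, LoNibbles) + PSHUFB(LUT, HiNibbles), one byte at a time.
  return LUT[B & 0xF] + LUT[B >> 4];
}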
if (!VT.isVector()) { MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } int NumElts = VT.getVectorNumElements(); int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; // Decompose 256-bit ops into smaller 128-bit ops. if (VT.is256BitVector()) return Lower256IntUnary(Op, DAG); assert(VT.is128BitVector() && "Only 128-bit vector bitreverse lowering supported."); // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we // perform the BSWAP in the shuffle. // Its best to shuffle using the second operand as this will implicitly allow // memory folding for multiple vectors. SmallVector MaskElts; for (int i = 0; i != NumElts; ++i) { for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { int SourceByte = 16 + (i * ScalarSizeInBytes) + j; int PermuteByte = SourceByte | (2 << 5); MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); } } SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); SDValue Res = DAG.getBitcast(MVT::v16i8, In); Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), Res, Mask); return DAG.getBitcast(VT, Res); } static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarType() == MVT::i8 && "Only byte vector BITREVERSE supported"); // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. if (VT.is256BitVector() && !Subtarget.hasInt256()) return Lower256IntUnary(Op, DAG); // Perform BITREVERSE using PSHUFB lookups. Each byte is split into // two nibbles and a PSHUFB lookup to find the bitreverse of each // 0-15 value (moved to the other nibble). 
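//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code below): a
// scalar model of the PSHUFB bit reversal. The low nibble is looked up in a
// table of bit-reversed nibbles placed in the high half of the byte, the high
// nibble in a table placing them in the low half, and the two results are
// OR'd. Function name is hypothetical.
//===----------------------------------------------------------------------===//
#include <cstdint>

static inline uint8_t bitreverse8Lut(uint8_t B) {
  static const uint8_t RevNibble[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return uint8_t(RevNibble[B & 0xF] << 4) | RevNibble[B >> 4];
}
// Example: bitreverse8Lut(0x01) == 0x80, matching the LoLUT/HiLUT tables used
// by the vector lowering.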
SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); const int LoLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; const int HiLUT[16] = { /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; SmallVector LoMaskElts, HiMaskElts; for (unsigned i = 0; i < NumElts; ++i) { LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); } SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NewOpc = 0; switch (N->getOpcode()) { case ISD::ATOMIC_LOAD_ADD: NewOpc = X86ISD::LADD; break; case ISD::ATOMIC_LOAD_SUB: NewOpc = X86ISD::LSUB; break; case ISD::ATOMIC_LOAD_OR: NewOpc = X86ISD::LOR; break; case ISD::ATOMIC_LOAD_XOR: NewOpc = X86ISD::LXOR; break; case ISD::ATOMIC_LOAD_AND: NewOpc = X86ISD::LAND; break; default: llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); } MachineMemOperand *MMO = cast(N)->getMemOperand(); return DAG.getMemIntrinsicNode( NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, /*MemVT=*/N->getSimpleValueType(0), MMO); } /// Lower atomic_load_ops into LOCK-prefixed operations. static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Chain = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); unsigned Opc = N->getOpcode(); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // We can lower atomic_load_add into LXADD. However, any other atomicrmw op // can only be lowered when the result is unused. They should have already // been transformed into a cmpxchg loop in AtomicExpand. if (N->hasAnyUseOfValue(0)) { // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to // select LXADD if LOCK_SUB can't be selected. if (Opc == ISD::ATOMIC_LOAD_SUB) { AtomicSDNode *AN = cast(N.getNode()); RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, AN->getMemOperand()); } assert(Opc == ISD::ATOMIC_LOAD_ADD && "Used AtomicRMW ops other than Add should have been expanded!"); return N; } SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. 
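//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above): the
// rewrite applied when the result of an atomic subtract is used. There is no
// locked "exchange-and-subtract", so the subtraction becomes an addition of
// the two's-complement negation, which can then be selected as LOCK XADD.
// Plain C++ model; function name is hypothetical.
//===----------------------------------------------------------------------===//
#include <atomic>
#include <cstdint>

static uint32_t fetchSubViaAdd(std::atomic<uint32_t> &A, uint32_t V) {
  // 0 - V is the two's-complement negation; unsigned wraparound is well defined.
  return A.fetch_add(0u - V, std::memory_order_seq_cst);
}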
assert(!N->hasAnyUseOfValue(0)); DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); return SDValue(); } static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); // Convert seq_cst store -> xchg // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) // FIXME: On 32-bit, store -> fist or movq would be more efficient // (The only way to get a 16-byte store is cmpxchg16b) // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. if (cast(Node)->getOrdering() == AtomicOrdering::SequentiallyConsistent || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) { SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, cast(Node)->getMemoryVT(), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2), cast(Node)->getMemOperand()); return Swap.getValue(1); } // Other atomic stores have a simple pattern. return Op; } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); MVT VT = N->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDLoc DL(N); // Set the carry flag. SDValue Carry = Op.getOperand(2); EVT CarryVT = Carry.getValueType(); APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits()); Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), Carry, DAG.getConstant(NegOne, DL, CarryVT)); unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB; SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry.getValue(1)); SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG); if (N->getValueType(1) == MVT::i1) SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); // For MacOSX, we want to call an alternative entry point: __sincos_stret, // which returns the values as { float, float } (in XMM0) or // { double, double } (which is returned in XMM0, XMM1). SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); bool isF64 = ArgVT == MVT::f64; // Only optimize x86_64 for now. i386 is a bit messy. For f32, // the small struct {f32, f32} is returned in (eax, edx). For f64, // the results are returned via SRet in memory. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = TLI.getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); std::pair CallResult = TLI.LowerCallTo(CLI); if (isF64) // Returned in xmm0 and xmm1. return CallResult.first; // Returned in bits 0:31 and 32:64 xmm0. 
SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(0, dl)); SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, DAG.getIntPtrConstant(1, dl)); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes = false) { // Check if InOp already has the right width. MVT InVT = InOp.getSimpleValueType(); if (InVT == NVT) return InOp; if (InOp.isUndef()) return DAG.getUNDEF(NVT); assert(InVT.getVectorElementType() == NVT.getVectorElementType() && "input and widen element type must match"); unsigned InNumElts = InVT.getVectorNumElements(); unsigned WidenNumElts = NVT.getVectorNumElements(); assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && "Unexpected request for vector widening"); SDLoc dl(InOp); if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) { SDValue N1 = InOp.getOperand(1); if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || N1.isUndef()) { InOp = InOp.getOperand(0); InVT = InOp.getSimpleValueType(); InNumElts = InVT.getVectorNumElements(); } } if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { SmallVector Ops; for (unsigned i = 0; i < InNumElts; ++i) Ops.push_back(InOp.getOperand(i)); EVT EltVT = InOp.getOperand(0).getValueType(); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) Ops.push_back(FillVal); return DAG.getBuildVector(NVT, dl, Ops); } SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : DAG.getUNDEF(NVT); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, InOp, DAG.getIntPtrConstant(0, dl)); } static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedScatterSDNode *N = cast(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); SDValue Scale = N->getScale(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); SDValue BasePtr = N->getBasePtr(); if (VT == MVT::v2f32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); // If the index is v2i64 and we have VLX we can use xmm for data and index. if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } return SDValue(); } if (VT == MVT::v2i32) { assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 and we have VLX we can use xmm for data and index. 
if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } // Custom widen all the operands to avoid promotion. EVT NewIndexVT = EVT::getVectorVT( *DAG.getContext(), Index.getValueType().getVectorElementType(), 4); Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(Index.getValueType())); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl, Ops, N->getMemOperand()); } MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); // If the index is v2i32, we're being called by type legalization and we // should just let the default handling take care of it. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. if (!Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); Src = ExtendToType(Src, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; SDValue NewScatter = DAG.getTargetMemSDNode( VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked load op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked load op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); SDValue PassThru = ExtendToType(N->getPassThru(), WideDataVT, DAG); // Mask element has to be i1. 
assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), N->getBasePtr(), Mask, PassThru, N->getMemoryVT(), N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), DAG.getIntPtrConstant(0, dl)); SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; return DAG.getMergeValues(RetOps, dl); } static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { MaskedStoreSDNode *N = cast(Op.getNode()); SDValue DataToStore = N->getValue(); MVT VT = DataToStore.getSimpleValueType(); MVT ScalarVT = VT.getScalarType(); SDValue Mask = N->getMask(); SDLoc dl(Op); assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && "Expanding masked load is supported on AVX-512 target only!"); assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && "Expanding masked load is supported for 32 and 64-bit types only!"); assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && "Cannot lower masked store op."); assert((ScalarVT.getSizeInBits() >= 32 || (Subtarget.hasBWI() && (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && "Unsupported masked store op."); // This operation is legal for targets with VLX, but without // VLX the vector should be widened to 512 bit unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); // Mask element has to be i1. assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && "Unexpected mask type"); MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), Mask, N->getMemoryVT(), N->getMemOperand(), N->isTruncatingStore(), N->isCompressingStore()); } static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert(Subtarget.hasAVX2() && "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast(Op.getNode()); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue PassThru = N->getPassThru(); MVT IndexVT = Index.getSimpleValueType(); MVT MaskVT = Mask.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); // If the index is v2i32, we're being called by type legalization. if (IndexVT == MVT::v2i32) return SDValue(); // If we don't have VLX and neither the passthru or index is 512-bits, we // need to widen until one is. MVT OrigVT = VT; if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !IndexVT.is512BitVector()) { // Determine how much we need to widen by to get a 512-bit type. 
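//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the lowering code above): a
// scalar model of the widening done for gather/scatter when VLX is not
// available. A factor is chosen so that one operand reaches 512 bits, the data
// and index vectors are padded with don't-care values, and the mask is padded
// with zeros so the extra lanes never access memory. This model assumes data
// and index have the same element count; all names are hypothetical.
//===----------------------------------------------------------------------===//
#include <algorithm>
#include <cstdint>
#include <vector>

struct GatherOperands {
  std::vector<uint32_t> Data;   // 32-bit elements in this model
  std::vector<uint64_t> Index;  // 64-bit elements in this model
  std::vector<bool> Mask;       // one predicate bit per element
};

static void widenTo512(GatherOperands &Ops) {
  unsigned DataBits = 32u * unsigned(Ops.Data.size());
  unsigned IndexBits = 64u * unsigned(Ops.Index.size());
  unsigned Factor = std::min(512u / DataBits, 512u / IndexBits);
  unsigned NumElts = unsigned(Ops.Data.size()) * Factor;
  Ops.Data.resize(NumElts, 0);      // padding value is irrelevant (undef)
  Ops.Index.resize(NumElts, 0);     // likewise
  Ops.Mask.resize(NumElts, false);  // zeroed mask lanes stay inactive
}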
unsigned Factor = std::min(512/VT.getSizeInBits(), 512/IndexVT.getSizeInBits()); unsigned NumElts = VT.getVectorNumElements() * Factor; VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); PassThru = ExtendToType(PassThru, VT, DAG); Index = ExtendToType(Index, IndexVT, DAG); Mask = ExtendToType(Mask, MaskVT, DAG, true); } SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, N->getScale() }; SDValue NewGather = DAG.getTargetMemSDNode( DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), N->getMemOperand()); SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, NewGather, DAG.getIntPtrConstant(0, dl)); return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); } SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or // deferred to the GC strategy for the function in which they appear. For // now, however, they must be lowered to something. Since they are logically // no-ops in the case of a null GC strategy (or a GC strategy which does not // require special handling for these nodes), lower them as literal NOOPs for // the time being. SmallVector Ops; Ops.push_back(Op.getOperand(0)); if (Op->getGluedNode()) Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); SDLoc OpDL(Op); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); return NOOP; } /// Provide custom lowering hooks for some operations. 
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG); case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::SHL_PARTS: case ISD::SRA_PARTS: case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); case ISD::ZERO_EXTEND_VECTOR_INREG: case ISD::SIGN_EXTEND_VECTOR_INREG: return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG, Subtarget); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::FRAME_TO_ARGS_OFFSET: return 
LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::ROTL: case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::ADD: case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget); case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG); case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::ABS: return LowerABS(Op, Subtarget, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); } } /// Places new result values for the node in Results (their number /// and types must exactly match those of the original return values of /// the node), or leaves Results empty, which indicates that the node is not /// to be custom lowered after all. void X86TargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res = LowerOperation(SDValue(N, 0), DAG); if (!Res.getNode()) return; assert((N->getNumValues() <= Res->getNumValues()) && "Lowering returned the wrong number of results!"); // Places new result values base on N result number. // In some cases (LowerSINT_TO_FP for example) Res has more result values // than original node, chain should be dropped(last value). for (unsigned I = 0, E = N->getNumValues(); I != E; ++I) Results.push_back(Res.getValue(I)); } /// Replace a node with an illegal result type with a new node built out of /// custom code. void X86TargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const { SDLoc dl(N); switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); case ISD::MUL: { EVT VT = N->getValueType(0); assert(VT.isVector() && "Unexpected VT"); if (getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger && VT.getVectorNumElements() == 2) { // Promote to a pattern that will be turned into PMULUDQ. 
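//===----------------------------------------------------------------------===//
// Illustrative sketch (standalone, not part of the legalization code above):
// why the promoted v2i32 multiply can go through PMULUDQ and a truncate.
// PMULUDQ multiplies only the low 32 bits of each 64-bit lane (zero-extended),
// and the low 32 bits of that 64-bit product are exactly the 32-bit product,
// so whatever ANY_EXTEND leaves in the upper halves is irrelevant. Function
// name is hypothetical.
//===----------------------------------------------------------------------===//
#include <cstdint>

static inline uint32_t mul32ViaPmuludq(uint64_t LaneA, uint64_t LaneB) {
  uint64_t Prod = uint64_t(uint32_t(LaneA)) * uint64_t(uint32_t(LaneB));
  return uint32_t(Prod); // the TRUNCATE of the PMULUDQ result
}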
SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, N->getOperand(0)); SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64, N->getOperand(1)); SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1); Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul)); } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && VT.getVectorElementType() == MVT::i8) { // Pre-promote these to vXi16 to avoid op legalization thinking all 16 // elements are needed. MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); unsigned NumConcats = 16 / VT.getVectorNumElements(); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); Results.push_back(Res); } return; } case ISD::UADDSAT: case ISD::SADDSAT: case ISD::USUBSAT: case ISD::SSUBSAT: case X86ISD::VPMADDWD: case X86ISD::AVG: { // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and // X86ISD::AVG/VPMADDWD by widening. assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); EVT InVT = N->getOperand(0).getValueType(); assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && "Expected a VT that divides into 128 bits."); unsigned NumConcat = 128 / InVT.getSizeInBits(); EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), NumConcat * InVT.getVectorNumElements()); EVT WideVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumConcat * VT.getVectorNumElements()); SmallVector Ops(NumConcat, DAG.getUNDEF(InVT)); Ops[0] = N->getOperand(0); SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); Ops[0] = N->getOperand(1); SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } case ISD::SETCC: { // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when // setCC result type is v2i1 because type legalzation will end up with // a v4i1 setcc plus an extend. assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type"); if (N->getOperand(0).getValueType() != MVT::v2f32 || getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector) return; SDValue UNDEF = DAG.getUNDEF(MVT::v2f32); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS, N->getOperand(2)); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(1), UNDEF); Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); return; } case ISD::SDIV: case ISD::UDIV: case ISD::SREM: case ISD::UREM: { EVT VT = N->getValueType(0); if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector) { // If this RHS is a constant splat vector we can widen this and let // division/remainder by constant optimize it. // TODO: Can we do something for non-splat? APInt SplatVal; if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) { unsigned NumConcats = 128 / VT.getSizeInBits(); SmallVector Ops0(NumConcats, DAG.getUNDEF(VT)); Ops0[0] = N->getOperand(0); EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT); SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0); SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT); SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1); Results.push_back(Res); } return; } if (VT == MVT::v2i32) { // Legalize v2i32 div/rem by unrolling. Otherwise we promote to the // v2i64 and unroll later. But then we create i64 scalar ops which // might be slow in 64-bit mode or require a libcall in 32-bit mode. Results.push_back(DAG.UnrollVectorOp(N)); return; } if (VT.isVector()) return; LLVM_FALLTHROUGH; } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); Results.push_back(V); return; } case ISD::TRUNCATE: { MVT VT = N->getSimpleValueType(0); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; // The generic legalizer will try to widen the input type to the same // number of elements as the widened result type. But this isn't always // the best thing so do some custom legalization to avoid some cases. MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT(); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); unsigned InBits = InVT.getSizeInBits(); if (128 % InBits == 0) { // 128 bit and smaller inputs should avoid truncate all together and // just use a build_vector that will become a shuffle. // TODO: Widen and use a shuffle directly? MVT InEltVT = InVT.getSimpleVT().getVectorElementType(); EVT EltVT = VT.getVectorElementType(); unsigned WidenNumElts = WidenVT.getVectorNumElements(); SmallVector Ops(WidenNumElts, DAG.getUNDEF(EltVT)); // Use the original element count so we don't do more scalar opts than // necessary. unsigned MinElts = VT.getVectorNumElements(); for (unsigned i=0; i < MinElts; ++i) { SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In, DAG.getIntPtrConstant(i, dl)); Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val); } Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops)); return; } // With AVX512 there are some cases that can use a target specific // truncate node to go from 256/512 to less than 128 with zeros in the // upper elements of the 128 bit result. if (Subtarget.hasAVX512() && isTypeLegal(InVT)) { // We can use VTRUNC directly if for 256 bits with VLX or for any 512. if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) { Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } // There's one case we can widen to 512 bits and use VTRUNC. 
if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) { In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In, DAG.getUNDEF(MVT::v4i64)); Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); return; } } return; } case ISD::SIGN_EXTEND_VECTOR_INREG: { if (ExperimentalVectorWideningLegalization) return; EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && (InVT == MVT::v16i16 || InVT == MVT::v32i8)) { // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting // we allow the sra from the extend to i32 to be shared by the split. EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), InVT.getVectorNumElements() / 2); MVT ExtendVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements()); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExtractVT, In, DAG.getIntPtrConstant(0, dl)); In = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. SDValue Zero = DAG.getConstant(0, dl, ExtendVT); SDValue SignBits = DAG.getSetCC(dl, ExtendVT, Zero, In, ISD::SETGT); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to vXi64. SDValue Lo = getUnpackl(DAG, dl, ExtendVT, In, SignBits); Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo); SDValue Hi = getUnpackh(DAG, dl, ExtendVT, In, SignBits); Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } return; } case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { if (!ExperimentalVectorWideningLegalization) return; EVT VT = N->getValueType(0); SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && (InVT == MVT::v4i16 || InVT == MVT::v4i8)) { // Custom split this so we can extend i8/i16->i32 invec. This is better // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting // we allow the sra from the extend to i32 to be shared by the split. In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); // Fill a vector with sign bits for each element. SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); // Create an unpackl and unpackh to interleave the sign bits then bitcast // to v2i64. SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {0, 4, 1, 5}); Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, {2, 6, 3, 7}); Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); return; } if ((VT == MVT::v16i32 || VT == MVT::v8i64) && InVT.is128BitVector()) { // Perform custom splitting instead of the two stage extend we would get // by default. EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); assert(isTypeLegal(LoVT) && "Split VT not legal?"); bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND; SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG); // We need to shift the input over by half the number of elements. 
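// For example, with a v4i32 input (NumElts = 4, HalfNumElts = 2) the shuffle
// mask built below is {2, 3, -1, -1}: the high half is moved into the low
// lanes and the upper lanes are left undef before the second in-vector extend.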
unsigned NumElts = InVT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; SmallVector ShufMask(NumElts, SM_SentinelUndef); for (unsigned i = 0; i != HalfNumElts; ++i) ShufMask[i] = i + HalfNumElts; SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); Results.push_back(Res); } return; } case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); // Promote these manually to avoid over promotion to v2i64. Type // legalization will revisit the v2i32 operation for more cleanup. if ((VT == MVT::v2i8 || VT == MVT::v2i16) && getTypeAction(*DAG.getContext(), VT) == TypePromoteInteger) { // AVX512DQ provides instructions that produce a v2i64 result. if (Subtarget.hasDQI()) return; SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v2i32, Src); Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, MVT::v2i32, Res, DAG.getValueType(VT.getVectorElementType())); Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); Results.push_back(Res); return; } if (VT.isVector() && VT.getScalarSizeInBits() < 32) { if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; // Try to create a 128 bit vector, but don't exceed a 32 bit element. unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), VT.getVectorNumElements()); SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. Except // when the result is v2i32 since we can't widen the assert. if (PromoteVT != MVT::v2i32) Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext : ISD::AssertSext, dl, PromoteVT, Res, DAG.getValueType(VT.getVectorElementType())); // Truncate back to the original width. Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); // Now widen to 128 bits. unsigned NumConcats = 128 / VT.getSizeInBits(); MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), VT.getVectorNumElements() * NumConcats); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(VT)); ConcatOps[0] = Res; Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); Results.push_back(Res); return; } if (VT == MVT::v2i32) { assert((IsSigned || Subtarget.hasAVX512()) && "Can only handle signed conversion without AVX512"); assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); bool Widenv2i32 = getTypeAction(*DAG.getContext(), MVT::v2i32) == TypeWidenVector; if (Src.getValueType() == MVT::v2f64) { unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // If v2i32 is widened, we can defer to the generic legalizer. if (Widenv2i32) return; // Custom widen by doubling to a legal vector with. Isel will // further widen to v8f64. 
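// i.e. the v2f64 source is concatenated with undef to v4f64 and lowered as a
// plain ISD::FP_TO_UINT producing v4i32; on AVX512 without VLX the instruction
// selector is then expected to widen that to the 512-bit form as noted above.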
Opc = ISD::FP_TO_UINT; Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, DAG.getUNDEF(MVT::v2f64)); } SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); if (!Widenv2i32) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } if (SrcVT == MVT::v2f32 && getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { SDValue Idx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, DAG.getUNDEF(MVT::v2f32)); Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, // so early out here. return; } if (Subtarget.hasDQI() && VT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); unsigned NumElts = Subtarget.hasVLX() ? 4 : 8; // Using a 256-bit input here to guarantee 128-bit input for f32 case. // TODO: Use 128-bit vectors for f64 case? // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI. MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts); SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, DAG.getConstantFP(0.0, dl, VecInVT), Src, ZeroIdx); Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); Results.push_back(Res); return; } std::pair Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; if (FIST.getNode()) { // Return a load from the stack slot. if (StackSlot.getNode()) Results.push_back( DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo())); else Results.push_back(FIST); } return; } case ISD::SINT_TO_FP: { assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); SDValue Src = N->getOperand(0); if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64) return; Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src)); return; } case ISD::UINT_TO_FP: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT VT = N->getValueType(0); if (VT != MVT::v2f32) return; SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src)); return; } if (SrcVT != MVT::v2i32) return; SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); SDValue VBias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64); SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; } case ISD::FP_ROUND: { if (!isTypeLegal(N->getOperand(0).getValueType())) return; SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); return; } case ISD::FP_EXTEND: { // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. // No other ValueType for FP_EXTEND should reach this point. 
assert(N->getValueType(0) == MVT::v2f32 && "Do not know how to legalize this Node"); return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default : llvm_unreachable("Do not know how to custom type " "legalize this intrinsic operation!"); case Intrinsic::x86_rdtsc: return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdtscp: return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget, Results); case Intrinsic::x86_rdpmc: return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); case Intrinsic::x86_xgetbv: return getExtendedControlRegister(N, dl, DAG, Subtarget, Results); } } case ISD::INTRINSIC_WO_CHAIN: { if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) Results.push_back(V); return; } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(1, dl, HalfT)); cpInL = DAG.getCopyToReg(N->getOperand(0), dl, Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, cpInH, cpInL.getValue(1)); SDValue swapInL, swapInH; swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(0, dl, HalfT)); swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3), DAG.getConstant(1, dl, HalfT)); swapInH = DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, swapInH, cpInH.getValue(1)); // If the current function needs the base pointer, RBX, // we shouldn't use cmpxchg directly. // Indeed the lowering of that instruction will clobber // that register and since RBX will be a reserved register // the register allocator will not make sure its value will // be properly saved and restored around this live-range. const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); SDValue Result; SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); unsigned BasePtr = TRI->getBaseRegister(); MachineMemOperand *MMO = cast(N)->getMemOperand(); if (TRI->hasBasePointer(DAG.getMachineFunction()) && (BasePtr == X86::RBX || BasePtr == X86::EBX)) { // ISel prefers the LCMPXCHG64 variant. // If that assert breaks, that means it is not the case anymore, // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, // not just EBX. This is a matter of accepting i64 input for that // pseudo, and restoring into the register of the right wide // in expand pseudo. Everything else should just work. assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) && "Saving only half of the RBX"); unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG : X86ISD::LCMPXCHG8_SAVE_EBX_DAG; SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, HalfT, swapInH.getValue(1)); SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL, RBXSave, /*Glue*/ RBXSave.getValue(2)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } else { unsigned Opcode = Regs64bit ? 
X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG; swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, Regs64bit ? X86::RBX : X86::EBX, swapInL, swapInH.getValue(1)); SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), swapInL.getValue(1)}; Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO); } SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, Regs64bit ? X86::RAX : X86::EAX, HalfT, Result.getValue(1)); SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, MVT::i32, cpOutH.getValue(2)); SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); Results.push_back(Success); Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_SWAP: case ISD::ATOMIC_LOAD_ADD: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_XOR: case ISD::ATOMIC_LOAD_NAND: case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: case ISD::ATOMIC_LOAD: { // Delegate to generic TypeLegalization. Situations we can really handle // should have already been dealt with by AtomicExpandPass.cpp. break; } case ISD::BITCAST: { assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); EVT DstVT = N->getValueType(0); EVT SrcVT = N->getOperand(0).getValueType(); // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target // we can split using the k-register rather than memory. if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) { assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); Lo = DAG.getBitcast(MVT::i32, Lo); Hi = DAG.getBitcast(MVT::i32, Hi); SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); Results.push_back(Res); return; } // Custom splitting for BWI types when AVX512F is available but BWI isn't. if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) && SrcVT.isVector() && isTypeLegal(SrcVT)) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); MVT CastVT = (DstVT == MVT::v32i16) ? 
MVT::v16i16 : MVT::v32i8; Lo = DAG.getBitcast(CastVT, Lo); Hi = DAG.getBitcast(CastVT, Hi); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi); Results.push_back(Res); return; } if (SrcVT != MVT::f64 || (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8) || getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) return; unsigned NumElts = DstVT.getVectorNumElements(); EVT SVT = DstVT.getVectorElementType(); EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2); SDValue Res; Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, N->getOperand(0)); Res = DAG.getBitcast(WiderVT, Res); Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } case ISD::MGATHER: { EVT VT = N->getValueType(0); if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); if (Index.getValueType() != MVT::v2i64) return; SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Gather->getPassThru(), DAG.getUNDEF(MVT::v2f32)); if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); Results.push_back(Res); Results.push_back(Res.getValue(2)); return; } if (VT == MVT::v2i32) { auto *Gather = cast(N); SDValue Index = Gather->getIndex(); SDValue Mask = Gather->getMask(); assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Gather->getPassThru(), DAG.getUNDEF(MVT::v2i32)); // If the index is v2i64 we can use it directly. if (Index.getValueType() == MVT::v2i64 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { if (!Subtarget.hasVLX()) { // We need to widen the mask, but the instruction will only use 2 // of its elements. So we can use undef. Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getUNDEF(MVT::v2i1)); Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); } SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getTargetMemSDNode( DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, Gather->getMemoryVT(), Gather->getMemOperand()); SDValue Chain = Res.getValue(2); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) { EVT IndexVT = Index.getValueType(); EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), IndexVT.getScalarType(), 4); // Otherwise we need to custom widen everything to avoid promotion. 
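// The index is widened to 4 elements with undef, but the mask is padded with
// zeros rather than undef: inactive gather lanes must not touch memory, so the
// extra lanes have to be provably disabled. The pass-through is widened with
// undef since those lanes are never selected.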
Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, DAG.getUNDEF(IndexVT)); Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, DAG.getConstant(0, dl, MVT::v2i1)); SDValue Ops[] = { Gather->getChain(), PassThru, Mask, Gather->getBasePtr(), Index, Gather->getScale() }; SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), Gather->getMemoryVT(), dl, Ops, Gather->getMemOperand()); SDValue Chain = Res.getValue(1); if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector) Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); Results.push_back(Chain); return; } } return; } case ISD::LOAD: { // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp // cast since type legalization will try to use an i64 load. MVT VT = N->getSimpleValueType(0); assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) return; if (!ISD::isNON_EXTLoad(N)) return; auto *Ld = cast(N); MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue Chain = Res.getValue(1); MVT WideVT = MVT::getVectorVT(LdVT, 2); Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res); MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements() * 2); Res = DAG.getBitcast(CastVT, Res); Results.push_back(Res); Results.push_back(Chain); return; } } } const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((X86ISD::NodeType)Opcode) { case X86ISD::FIRST_NUMBER: break; case X86ISD::BSF: return "X86ISD::BSF"; case X86ISD::BSR: return "X86ISD::BSR"; case X86ISD::SHLD: return "X86ISD::SHLD"; case X86ISD::SHRD: return "X86ISD::SHRD"; case X86ISD::FAND: return "X86ISD::FAND"; case X86ISD::FANDN: return "X86ISD::FANDN"; case X86ISD::FOR: return "X86ISD::FOR"; case X86ISD::FXOR: return "X86ISD::FXOR"; case X86ISD::FILD: return "X86ISD::FILD"; case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; case X86ISD::FLD: return "X86ISD::FLD"; case X86ISD::FST: return "X86ISD::FST"; case X86ISD::CALL: return "X86ISD::CALL"; case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG"; case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG"; case X86ISD::BT: return "X86ISD::BT"; case X86ISD::CMP: return "X86ISD::CMP"; case X86ISD::COMI: return "X86ISD::COMI"; case X86ISD::UCOMI: return "X86ISD::UCOMI"; case X86ISD::CMPM: return "X86ISD::CMPM"; case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND"; case X86ISD::SETCC: return "X86ISD::SETCC"; case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; case X86ISD::FSETCC: return "X86ISD::FSETCC"; case X86ISD::FSETCCM: return "X86ISD::FSETCCM"; case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND"; case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case 
X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; case X86ISD::Wrapper: return "X86ISD::Wrapper"; case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q"; case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W"; case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D"; case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; case X86ISD::PINSRB: return "X86ISD::PINSRB"; case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; case X86ISD::BLENDI: return "X86ISD::BLENDI"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::HADD: return "X86ISD::HADD"; case X86ISD::HSUB: return "X86ISD::HSUB"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAXS: return "X86ISD::FMAXS"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; case X86ISD::FMINS: return "X86ISD::FMINS"; case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND"; case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND"; case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_SJLJ_SETUP_DISPATCH: return "X86ISD::EH_SJLJ_SETUP_DISPATCH"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r"; case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG"; case X86ISD::LCMPXCHG8_SAVE_EBX_DAG: return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG"; case X86ISD::LCMPXCHG16_SAVE_RBX_DAG: return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG"; case X86ISD::LADD: return "X86ISD::LADD"; case X86ISD::LSUB: return "X86ISD::LSUB"; case X86ISD::LOR: return "X86ISD::LOR"; case X86ISD::LXOR: return "X86ISD::LXOR"; case X86ISD::LAND: return "X86ISD::LAND"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC"; case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS"; case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS"; case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES"; case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS"; case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES"; case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND"; case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND"; case X86ISD::VFPROUND: return 
"X86ISD::VFPROUND"; case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND"; case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND"; case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; case X86ISD::VSRL: return "X86ISD::VSRL"; case X86ISD::VSRA: return "X86ISD::VSRA"; case X86ISD::VSHLI: return "X86ISD::VSHLI"; case X86ISD::VSRLI: return "X86ISD::VSRLI"; case X86ISD::VSRAI: return "X86ISD::VSRAI"; case X86ISD::VSHLV: return "X86ISD::VSHLV"; case X86ISD::VSRLV: return "X86ISD::VSRLV"; case X86ISD::VSRAV: return "X86ISD::VSRAV"; case X86ISD::VROTLI: return "X86ISD::VROTLI"; case X86ISD::VROTRI: return "X86ISD::VROTRI"; case X86ISD::VPPERM: return "X86ISD::VPPERM"; case X86ISD::CMPP: return "X86ISD::CMPP"; case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ"; case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; case X86ISD::SBB: return "X86ISD::SBB"; case X86ISD::SMUL: return "X86ISD::SMUL"; case X86ISD::UMUL: return "X86ISD::UMUL"; case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; case X86ISD::TESTP: return "X86ISD::TESTP"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::KADD: return "X86ISD::KADD"; case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: return "X86ISD::VALIGN"; case X86ISD::VSHLD: return "X86ISD::VSHLD"; case X86ISD::VSHRD: return "X86ISD::VSHRD"; case X86ISD::VSHLDV: return "X86ISD::VSHLDV"; case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; case X86ISD::MOVSD: return "X86ISD::MOVSD"; case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; case X86ISD::VPERMV: return "X86ISD::VPERMV"; case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; 
case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; case X86ISD::VRANGES: return "X86ISD::VRANGES"; case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER"; case X86ISD::MFENCE: return "X86ISD::MFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND"; case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND"; case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND"; case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR"; case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR"; case X86ISD::XTEST: return "X86ISD::XTEST"; case X86ISD::COMPRESS: return "X86ISD::COMPRESS"; case X86ISD::EXPAND: return "X86ISD::EXPAND"; case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; case X86ISD::RCP14: return "X86ISD::RCP14"; case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::EXP2: return "X86ISD::EXP2"; case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND"; case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND"; case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND"; case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND"; case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND"; case 
X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND"; case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND"; case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND"; case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND"; case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND"; case X86ISD::SCALEF: return "X86ISD::SCALEF"; case X86ISD::SCALEFS: return "X86ISD::SCALEFS"; case X86ISD::AVG: return "X86ISD::AVG"; case X86ISD::MULHRS: return "X86ISD::MULHRS"; case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND"; case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI"; case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI"; case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI"; case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI"; case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND"; case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND"; case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI"; case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI"; case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND"; case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND"; case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P"; case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P"; case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT"; case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND"; case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI"; case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI"; case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND"; case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI"; case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI"; case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND"; case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; case X86ISD::NT_CALL: return "X86ISD::NT_CALL"; case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND"; case X86ISD::UMWAIT: return "X86ISD::UMWAIT"; case X86ISD::TPAUSE: return "X86ISD::TPAUSE"; } return nullptr; } /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); // X86 allows a sign-extended 32-bit immediate field as a displacement. 
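// The general form being validated here is
//   BaseGV + BaseReg + Scale*IndexReg + BaseOffs
// e.g. the address in "movl 16(%rdi,%rcx,4), %eax" has BaseReg=%rdi, Scale=4,
// IndexReg=%rcx and BaseOffs=16; the checks below reject combinations that
// the encoding (or the current code model) cannot express.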
if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) return false; if (AM.BaseGV) { unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); // If a reference to this global requires an extra load, we can't fold it. if (isGlobalStubReference(GVFlags)) return false; // If BaseGV requires a register for the PIC base, we cannot also have a // BaseReg specified. if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) return false; // If lower 4G is not available, then we must use rip-relative addressing. if ((M != CodeModel::Small || isPositionIndependent()) && Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) return false; } switch (AM.Scale) { case 0: case 1: case 2: case 4: case 8: // These scales always work. break; case 3: case 5: case 9: // These scales are formed with basereg+scalereg. Only accept if there is // no basereg yet. if (AM.HasBaseReg) return false; break; default: // Other stuff never works. return false; } return true; } bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); // 8-bit shifts are always expensive, but versions with a scalar amount aren't // particularly cheaper than those without. if (Bits == 8) return false; // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 && (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) return false; // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable // shifts just as cheap as scalar ones. if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // AVX512BW has shifts such as vpsllvw. if (Subtarget.hasBWI() && Bits == 16) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a // fully general vector. return true; } bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { // Can also use sub to handle negated immediates. return isInt<32>(Imm); } bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { return isInt<32>(Imm); } bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (!VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
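// For example, "movl %ecx, %eax" already clears bits 63:32 of RAX, so an
// i32 -> i64 zext needs no extra instruction in 64-bit mode.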
return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); } bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) return true; if (Val.getOpcode() != ISD::LOAD) return false; if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i8: case MVT::i16: case MVT::i32: // X86 has 8, 16, and 32-bit zero-extending loads. return true; } return false; } bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT SrcVT = ExtVal.getOperand(0).getValueType(); // There is no extending load for vXi1. if (SrcVT.getScalarType() == MVT::i1) return false; return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { if (!Subtarget.hasAnyFMA()) return false; VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { // i16 instructions are longer (0x66 prefix) and potentially slower. return !(VT1 == MVT::i32 && VT2 == MVT::i16); } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool X86TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (!VT.isSimple()) return false; // Not for i1 vectors if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can // handle any possible shuffle mask that results. return isTypeLegal(VT.getSimpleVT()); } bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef Mask, EVT VT) const { // Don't convert an 'and' into a shuffle that we don't directly support. // vpblendw and vpshufb for 256-bit vectors are not available on AVX1. if (!Subtarget.hasAVX2()) if (VT == MVT::v32i8 || VT == MVT::v16i16) return false; // Just delegate to the generic legality, clear masks aren't special. return isShuffleMaskLegal(Mask, VT); } bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { // If the subtarget is using retpolines, we need to not generate jump tables. if (Subtarget.useRetpolineIndirectBranches()) return false; // Otherwise, fallback on the generic logic. return TargetLowering::areJTsAllowed(Fn); } //===----------------------------------------------------------------------===// // X86 Scheduler Hooks //===----------------------------------------------------------------------===// /// Utility function to emit xbegin specifying the start of an RTM region. 
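/// The pseudo models roughly "v = _xbegin()": v is -1 (_XBEGIN_STARTED) when
/// the transaction starts, otherwise the abort status the hardware leaves in
/// EAX, which is what the fallback block below copies out.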
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { DebugLoc DL = MI.getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // // thisMBB: // xbegin sinkMBB // // mainMBB: // s0 = -1 // // fallBB: // eax = # XABORT_DEF // s1 = eax // // sinkMBB: // v = phi(s0/mainBB, s1/fallBB) MachineBasicBlock *thisMBB = MBB; MachineFunction *MF = MBB->getParent(); MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, fallMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned fallDstReg = MRI.createVirtualRegister(RC); // thisMBB: // xbegin fallMBB // # fallthrough to mainMBB // # abortion to fallMBB BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(fallMBB); // mainMBB: // mainDstReg := -1 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); mainMBB->addSuccessor(sinkMBB); // fallMBB: // ; pseudo instruction to model hardware's definition from XABORT // EAX := XABORT_DEF // fallDstReg := EAX BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) .addReg(X86::EAX); fallMBB->addSuccessor(sinkMBB); // sinkMBB: // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(fallDstReg).addMBB(fallMBB); MI.eraseFromParent(); return sinkMBB; } static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert input VAL into EAX BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX) .addReg(MI.getOperand(0).getReg()); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert zero to EDX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX); // insert WRPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // insert zero to ECX BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX); // insert RDPKRU instruction BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr)); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(X86::EAX); MI.eraseFromParent(); // The pseudo is gone now. 
return BB; } static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget, unsigned Opc) { DebugLoc dl = MI.getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); unsigned ValOps = X86::AddrNumOperands; BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX) .addReg(MI.getOperand(ValOps).getReg()); BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX) .addReg(MI.getOperand(ValOps + 1).getReg()); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(Opc)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB, const X86Subtarget &Subtarget) { DebugLoc dl = MI->getDebugLoc(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); // Address into RAX/EAX unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r; unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI->getOperand(i)); // The instruction doesn't actually take any operands though. BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr)); MI->eraseFromParent(); // The pseudo is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit va_arg instruction on X86-64. // Operands to this pseudo-instruction: // 0 ) Output : destination address (reg) // 1-5) Input : va_list address (addr, i64mem) // 6 ) ArgSize : Size (in bytes) of vararg type // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset // 8 ) Align : Alignment of type // 9 ) EFLAGS (implicit-def) assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); static_assert(X86::AddrNumOperands == 5, "VAARG_64 assumes 5 address operands"); unsigned DestReg = MI.getOperand(0).getReg(); MachineOperand &Base = MI.getOperand(1); MachineOperand &Scale = MI.getOperand(2); MachineOperand &Index = MI.getOperand(3); MachineOperand &Disp = MI.getOperand(4); MachineOperand &Segment = MI.getOperand(5); unsigned ArgSize = MI.getOperand(6).getImm(); unsigned ArgMode = MI.getOperand(7).getImm(); unsigned Align = MI.getOperand(8).getImm(); // Memory Reference assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); // Machine Information const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI.getDebugLoc(); // struct va_list { // i32 gp_offset // i32 fp_offset // i64 overflow_area (address) // i64 reg_save_area (address) // } // sizeof(va_list) = 24 // alignment(va_list) = 8 unsigned TotalNumIntRegs = 6; unsigned TotalNumXMMRegs = 8; bool UseGPOffset = (ArgMode == 1); bool UseFPOffset = (ArgMode == 2); unsigned MaxOffset = TotalNumIntRegs * 8 + (UseFPOffset ? 
TotalNumXMMRegs * 16 : 0); /* Align ArgSize to a multiple of 8 */ unsigned ArgSizeA8 = (ArgSize + 7) & ~7; bool NeedsAlign = (Align > 8); MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; if (!UseGPOffset && !UseFPOffset) { // If we only pull from the overflow region, we don't create a branch. // We don't need to alter control flow. OffsetDestReg = 0; // unused OverflowDestReg = DestReg; offsetMBB = nullptr; overflowMBB = thisMBB; endMBB = thisMBB; } else { // First emit code to check if gp_offset (or fp_offset) is below the bound. // If so, pull the argument from reg_save_area. (branch to offsetMBB) // If not, pull from overflow_area. (branch to overflowMBB) // // thisMBB // | . // | . // offsetMBB overflowMBB // | . // | . // endMBB // Registers for the PHI in endMBB OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); MF->insert(MBBIter, overflowMBB); MF->insert(MBBIter, endMBB); // Transfer the remainder of MBB and its successor edges to endMBB. endMBB->splice(endMBB->begin(), thisMBB, std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); // Make offsetMBB and overflowMBB successors of thisMBB thisMBB->addSuccessor(offsetMBB); thisMBB->addSuccessor(overflowMBB); // endMBB is a successor of both offsetMBB and overflowMBB offsetMBB->addSuccessor(endMBB); overflowMBB->addSuccessor(endMBB); // Load the offset value into a register OffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .setMemRefs(MMOs); // Check if there is enough room left to pull this argument. BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) .addReg(OffsetReg) .addImm(MaxOffset + 8 - ArgSizeA8); // Branch to "overflowMBB" if offset >= max // Fall through to "offsetMBB" otherwise BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) .addMBB(overflowMBB); } // In offsetMBB, emit code to use the reg_save_area. if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 16) .add(Segment) .setMemRefs(MMOs); // Zero-extend the offset unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) .addImm(0) .addReg(OffsetReg) .addImm(X86::sub_32bit); // Add the offset to the reg_save_area to get the final address. 
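// i.e. arg_addr = reg_save_area + gp_offset (or fp_offset), matching the
// SysV x86-64 va_arg algorithm; the offset is then advanced by 8 for a GP
// register slot or 16 for an XMM slot and stored back into the va_list.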
BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) .addReg(OffsetReg64) .addReg(RegSaveReg); // Compute the offset for the next argument unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) .addReg(OffsetReg) .addImm(UseFPOffset ? 16 : 8); // Store it back into the va_list. BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, UseFPOffset ? 4 : 0) .add(Segment) .addReg(NextOffsetReg) .setMemRefs(MMOs); // Jump to endMBB BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) .addMBB(endMBB); } // // Emit code to use overflow area // // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .setMemRefs(MMOs); // If we need to align it, do so. Otherwise, just copy the address // to OverflowDestReg. if (NeedsAlign) { // Align the overflow address assert(isPowerOf2_32(Align) && "Alignment must be a power of 2"); unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) .addReg(OverflowAddrReg); } // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) .add(Base) .add(Scale) .add(Index) .addDisp(Disp, 8) .add(Segment) .addReg(NextAddrReg) .setMemRefs(MMOs); // If we branched, emit the PHI to the front of endMBB. if (offsetMBB) { BuildMI(*endMBB, endMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(OffsetDestReg).addMBB(offsetMBB) .addReg(OverflowDestReg).addMBB(overflowMBB); } // Erase the pseudo instruction MI.eraseFromParent(); return endMBB; } MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( MachineInstr &MI, MachineBasicBlock *MBB) const { // Emit code to save XMM registers to the stack. The ABI says that the // number of registers to save is given in %al, so it's theoretically // possible to do an indirect jump trick to avoid saving all of them, // however this code takes a simpler approach and just executes all // of the stores if %al is non-zero. It's less code, and it's probably // easier on the hardware branch predictor, and stores aren't all that // expensive anyway. // Create the new basic blocks. One block contains all the XMM stores, // and one block is the final destination regardless of whether any // stores were performed. const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); F->insert(MBBIter, EndMBB); // Transfer the remainder of MBB and its successor edges to EndMBB. 
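// splice() moves every instruction after the pseudo into EndMBB, and
// transferSuccessorsAndUpdatePHIs() re-points the outgoing edges (and any
// PHIs in the successors) at EndMBB; the original block is then wired to
// fall through into the XMM-save block.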
EndMBB->splice(EndMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndMBB->transferSuccessorsAndUpdatePHIs(MBB); // The original block will now fall through to the XMM save block. MBB->addSuccessor(XMMSaveMBB); // The XMMSaveMBB will fall through to the end block. XMMSaveMBB->addSuccessor(EndMBB); // Now add the instructions. const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); unsigned CountReg = MI.getOperand(0).getReg(); int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); MBB->addSuccessor(EndMBB); } // Make sure the last operand is EFLAGS, which gets clobbered by the branch // that was just emitted, but clearly shouldn't be "saved". assert((MI.getNumOperands() <= 3 || !MI.getOperand(MI.getNumOperands() - 1).isReg() || MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) && "Expected last argument to be EFLAGS"); unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; MachineMemOperand *MMO = F->getMachineMemOperand( MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) .addFrameIndex(RegSaveFrameIndex) .addImm(/*Scale=*/1) .addReg(/*IndexReg=*/0) .addImm(/*Disp=*/Offset) .addReg(/*Segment=*/0) .addReg(MI.getOperand(i).getReg()) .addMemOperand(MMO); } MI.eraseFromParent(); // The pseudo instruction is gone now. return EndMBB; } // The EFLAGS operand of SelectItr might be missing a kill marker // because there were multiple uses of EFLAGS, and ISel didn't know // which to mark. Figure out whether SelectItr should have had a // kill marker, and set it if it should. Returns the correct kill // marker value. static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { // Scan forward through BB for a use/def of EFLAGS. MachineBasicBlock::iterator miI(std::next(SelectItr)); for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { const MachineInstr& mi = *miI; if (mi.readsRegister(X86::EFLAGS)) return false; if (mi.definesRegister(X86::EFLAGS)) break; // Should have kill-flag - update below. } // If we hit the end of the block, check whether EFLAGS is live into a // successor. if (miI == BB->end()) { for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), sEnd = BB->succ_end(); sItr != sEnd; ++sItr) { MachineBasicBlock* succ = *sItr; if (succ->isLiveIn(X86::EFLAGS)) return false; } } // We found a def, or hit the end of the basic block and EFLAGS wasn't live // out. SelectMI should have a kill flag on EFLAGS. SelectItr->addRegisterKilled(X86::EFLAGS, TRI); return true; } // Return true if it is OK for this CMOV pseudo-opcode to be cascaded // together with other CMOV pseudo-opcodes into a single basic-block with // conditional jump around it. 
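// A typical source of such a group is a pair of selects on the same flags,
// e.g. "a = c ? x : y; b = c ? u : v;" -- the consecutive CMOV pseudos can
// then share one conditional branch and become PHIs in the sink block.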
static bool isCMOVPseudo(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_VR128:
  case X86::CMOV_VR128X:
  case X86::CMOV_VR256:
  case X86::CMOV_VR256X:
  case X86::CMOV_VR512:
  case X86::CMOV_VK2:
  case X86::CMOV_VK4:
  case X86::CMOV_VK8:
  case X86::CMOV_VK16:
  case X86::CMOV_VK32:
  case X86::CMOV_VK64:
    return true;

  default:
    return false;
  }
}

// Helper function, which inserts PHI functions into SinkMBB:
//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI function inserted.
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
    MachineBasicBlock *SinkMBB) {
  MachineFunction *MF = TrueMBB->getParent();
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  DebugLoc DL = MIItBegin->getDebugLoc();

  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  MachineInstrBuilder MIB;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    unsigned DestReg = MIIt->getOperand(0).getReg();
    unsigned Op1Reg = MIIt->getOperand(1).getReg();
    unsigned Op2Reg = MIIt->getOperand(2).getReg();

    // If this CMOV we are generating is the opposite condition from
    // the jump we generated, then we have to swap the operands for the
    // PHI that is going to be generated.
    if (MIIt->getOperand(3).getImm() == OppCC)
      std::swap(Op1Reg, Op2Reg);

    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
      Op1Reg = RegRewriteTable[Op1Reg].first;

    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
      Op2Reg = RegRewriteTable[Op2Reg].second;

    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(TrueMBB);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  return MIB;
}

// Lower cascaded selects in the form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
MachineBasicBlock *
X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
                                             MachineInstr &SecondCascadedCMOV,
                                             MachineBasicBlock *ThisMBB) const {
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = FirstCMOV.getDebugLoc();

  // We lower cascaded CMOVs such as
  //
  //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
  //
  // to two successive branches.
  //
  // Without this, we would add a PHI between the two jumps, which ends up
  // creating a few copies all around.
For instance, for // // (sitofp (zext (fcmp une))) // // we would generate: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // movaps %xmm0, %xmm1 // jne .LBB5_2 // xorps %xmm1, %xmm1 // .LBB5_2: // jp .LBB5_4 // movaps %xmm1, %xmm0 // .LBB5_4: // retq // // because this custom-inserter would have generated: // // A // | \ // | B // | / // C // | \ // | D // | / // E // // A: X = ...; Y = ... // B: empty // C: Z = PHI [X, A], [Y, B] // D: empty // E: PHI [X, C], [Z, D] // // If we lower both CMOVs in a single step, we can instead generate: // // A // | \ // | C // | /| // |/ | // | | // | D // | / // E // // A: X = ...; Y = ... // D: empty // E: PHI [X, A], [X, C], [Y, D] // // Which, in our sitofp/fcmp example, gives us something like: // // ucomiss %xmm1, %xmm0 // movss <1.0f>, %xmm0 // jne .LBB5_4 // jp .LBB5_4 // xorps %xmm0, %xmm0 // .LBB5_4: // retq // // We lower cascaded CMOV into two successive branches to the same block. // EFLAGS is used by both, so mark it as live in the second. const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FirstInsertedMBB); F->insert(It, SecondInsertedMBB); F->insert(It, SinkMBB); // For a cascaded CMOV, we lower it to two successive branches to // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in // the FirstInsertedMBB. FirstInsertedMBB->addLiveIn(X86::EFLAGS); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { SecondInsertedMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(FirstCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FirstInsertedMBB); // The true block target of the first branch is always SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SecondInsertedMBB); // The true block for the branch of FirstInsertedMBB. FirstInsertedMBB->addSuccessor(SinkMBB); // This is fallthrough. SecondInsertedMBB->addSuccessor(SinkMBB); // Create the conditional branch instructions. 
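  // The emitted structure is, schematically:
  //
  //   ThisMBB:            jcc1 SinkMBB
  //   FirstInsertedMBB:   jcc2 SinkMBB          ; EFLAGS is live-in here
  //   SecondInsertedMBB:  (falls through)
  //   SinkMBB:            %res = phi [ F, SecondInsertedMBB ],
  //                                  [ T, ThisMBB ], [ T, FirstInsertedMBB ]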
X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); unsigned Opc = X86::GetCondBranchFromCond(FirstCC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); X86::CondCode SecondCC = X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] unsigned DestReg = FirstCMOV.getOperand(0).getReg(); unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); MachineInstrBuilder MIB = BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) .addReg(Op1Reg) .addMBB(SecondInsertedMBB) .addReg(Op2Reg) .addMBB(ThisMBB); // The second SecondInsertedMBB provides the same incoming value as the // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, TII->get(TargetOpcode::COPY), SecondCascadedCMOV.getOperand(0).getReg()) .addReg(FirstCMOV.getOperand(0).getReg()); // Now remove the CMOVs. FirstCMOV.eraseFromParent(); SecondCascadedCMOV.eraseFromParent(); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, MachineBasicBlock *ThisMBB) const { const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between and a branch opcode to use. // ThisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> FalseMBB // This code lowers all pseudo-CMOV instructions. Generally it lowers these // as described above, by inserting a BB, and then making a PHI at the join // point to select the true and false operands of the CMOV in the PHI. // // The code also handles two different cases of multiple CMOV opcodes // in a row. // // Case 1: // In this case, there are multiple CMOVs in a row, all which are based on // the same condition setting (or the exact opposite condition setting). // In this case we can lower all the CMOVs using a single inserted BB, and // then make a number of PHIs at the join point to model the CMOVs. The only // trickiness here, is that in a case like: // // t2 = CMOV cond1 t1, f1 // t3 = CMOV cond1 t2, f2 // // when rewriting this into PHIs, we have to perform some renaming on the // temps since you cannot have a PHI operand refer to a PHI result earlier // in the same block. The "simple" but wrong lowering would be: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t2(BB1), f2(BB2) // // but clearly t2 is not defined in BB1, so that is incorrect. The proper // renaming is to note that on the path through BB1, t2 is really just a // copy of t1, and do that renaming, properly generating: // // t2 = PHI t1(BB1), f1(BB2) // t3 = PHI t1(BB1), f2(BB2) // // Case 2: // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate // function - EmitLoweredCascadedSelect. 
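  // Case 2 is recognized below purely by its operand pattern, roughly:
  //
  //   %t = CMOV_xx %f, %tval, cc1
  //   %r = CMOV_xx killed %t, %tval, cc2   ; same opcode, same true operand,
  //                                        ; %t killed here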
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); // Check for case 1, where there are multiple CMOVs with the same condition // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the // number of jumps the most. if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; ++NextMIIt; } } // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); MachineFunction *F = ThisMBB->getParent(); MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++ThisMBB->getIterator(); F->insert(It, FalseMBB); F->insert(It, SinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); if (!LastCMOV->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { FalseMBB->addLiveIn(X86::EFLAGS); SinkMBB->addLiveIn(X86::EFLAGS); } // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. SinkMBB->splice(SinkMBB->begin(), ThisMBB, std::next(MachineBasicBlock::iterator(LastCMOV)), ThisMBB->end()); SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); // Fallthrough block for ThisMBB. ThisMBB->addSuccessor(FalseMBB); // The true block target of the first (or only) branch is always a SinkMBB. ThisMBB->addSuccessor(SinkMBB); // Fallthrough block for FalseMBB. FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); // SinkMBB: // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). ThisMBB->erase(MIItBegin, MIItEnd); return SinkMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI, MachineBasicBlock *BB) const { // Combine the following atomic floating-point modification pattern: // a.store(reg OP a.load(acquire), release) // Transform them into: // OPss (%gpr), %xmm // movss %xmm, (%gpr) // Or sd equivalent for 64-bit operations. 
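  // For example (an illustrative source-level sketch, float case):
  //
  //   std::atomic<float> A;
  //   A.store(A.load(std::memory_order_acquire) + X, std::memory_order_release);
  //
  // is matched as RELEASE_FADD32mr and becomes an addss (mem), %xmm followed
  // by a movss %xmm, (mem).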
unsigned MOp, FOp; switch (MI.getOpcode()) { default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); case X86::RELEASE_FADD32mr: FOp = X86::ADDSSrm; MOp = X86::MOVSSmr; break; case X86::RELEASE_FADD64mr: FOp = X86::ADDSDrm; MOp = X86::MOVSDmr; break; } const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); unsigned ValOpIdx = X86::AddrNumOperands; unsigned VSrc = MI.getOperand(ValOpIdx).getReg(); MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(FOp), MRI.createVirtualRegister(MRI.getRegClass(VSrc))) .addReg(VSrc); for (int i = 0; i < X86::AddrNumOperands; ++i) { MachineOperand &Operand = MI.getOperand(i); // Clear any kill flags on register operands as we'll create a second // instruction using the same address operands. if (Operand.isReg()) Operand.setIsKill(false); MIB.add(Operand); } MachineInstr *FOpMI = MIB; MIB = BuildMI(*BB, MI, DL, TII->get(MOp)); for (int i = 0; i < X86::AddrNumOperands; ++i) MIB.add(MI.getOperand(i)); MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); assert(MF->shouldSplitStack()); const bool Is64Bit = Subtarget.is64Bit(); const bool IsLP64 = Subtarget.isTarget64BitLP64(); const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; // BB: // ... [Till the alloca] // If stacklet is not large enough, jump to mallocMBB // // bumpMBB: // Allocate by subtracting from RSP // Jump to continueMBB // // mallocMBB: // Allocate by call to runtime // // continueMBB: // ... // [rest of original BB] // MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *AddrRegClass = getRegClassFor(getPointerTy(MF->getDataLayout())); unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); MF->insert(MBBIter, continueMBB); continueMBB->splice(continueMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); continueMBB->transferSuccessorsAndUpdatePHIs(BB); // Add code to the main basic block to check if the stack limit has been hit, // and if so, jump to mallocMBB otherwise to bumpMBB. BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) .addReg(tmpSPVReg).addReg(sizeVReg); BuildMI(BB, DL, TII->get(IsLP64 ? 
X86::CMP64mr:X86::CMP32mr)) .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) .addReg(SPLimitVReg); BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB); // bumpMBB simply decreases the stack pointer, since we know the current // stacklet has enough space. BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) .addReg(SPLimitVReg); BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Calls into a routine in libgcc to allocate more space from the heap. const uint32_t *RegMask = Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); if (IsLP64) { BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::RDI, RegState::Implicit) .addReg(X86::RAX, RegState::ImplicitDefine); } else if (Is64Bit) { BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) .addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EDI, RegState::Implicit) .addReg(X86::EAX, RegState::ImplicitDefine); } else { BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) .addImm(12); BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) .addExternalSymbol("__morestack_allocate_stack_space") .addRegMask(RegMask) .addReg(X86::EAX, RegState::ImplicitDefine); } if (!Is64Bit) BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) .addImm(16); BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) .addReg(IsLP64 ? X86::RAX : X86::EAX); BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); // Set up the CFG correctly. BB->addSuccessor(bumpMBB); BB->addSuccessor(mallocMBB); mallocMBB->addSuccessor(continueMBB); bumpMBB->addSuccessor(continueMBB); // Take care of the PHI nodes. BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), MI.getOperand(0).getReg()) .addReg(mallocPtrVReg) .addMBB(mallocMBB) .addReg(bumpSPPtrVReg) .addMBB(bumpMBB); // Delete the original pseudo instruction. MI.eraseFromParent(); // And we're done. return continueMBB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. if (!Subtarget.is32Bit()) return BB; // C++ EH creates a new target block to hold the restore code, and wires up // the new block to the return destination with a normal JMP_4. 
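  // The rewritten form is, roughly:
  //
  //   BB:          ... CATCHRET (retargeted to RestoreMBB) ...
  //   RestoreMBB:  EH_RESTORE            ; restore the stack pointer
  //                JMP_4 TargetMBB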
MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); assert(BB->succ_size() == 1); MF->insert(std::next(BB->getIterator()), RestoreMBB); RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RestoreMBB); MI.getOperand(0).setMBB(RestoreMBB); auto RestoreMBBI = RestoreMBB->begin(); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const Constant *PerFn = MF->getFunction().getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget.is32Bit()) { const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); } MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, MachineBasicBlock *BB) const { // So, here we replace TLSADDR with the sequence: // adjust_stackdown -> TLSADDR -> adjust_stackup. // We need this because TLSADDR is lowered into calls // inside MC, therefore without the two markers shrink-wrapping // may push the prologue/epilogue pass them. const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); MachineFunction &MF = *BB->getParent(); // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. // We don't call erase from parent because we want to keep the // original instruction around. unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); MachineInstrBuilder CallseqEnd = BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); return BB; } MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, MachineBasicBlock *BB) const { // This is pretty easy. We're taking the value that we received from // our load from the relocation, sticking it in either RDI (x86-64) // or EAX and doing an indirect call. The return value will then // be in the normal return register. MachineFunction *F = BB->getParent(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); assert(MI.getOperand(3).isGlobal() && "This should be a global"); // Get a register mask for the lowered call. // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = Subtarget.is64Bit() ? 
Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget.is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) .addReg(X86::RIP) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); addDirectMem(MIB, X86::RDI); MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); } else if (!isPositionIndependent()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(0) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } else { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) .addReg(TII->getGlobalBaseReg(F)) .addImm(0) .addReg(0) .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, MI.getOperand(3).getTargetFlags()) .addReg(0); MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); addDirectMem(MIB, X86::EAX); MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); } MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } static unsigned getOpcodeForRetpoline(unsigned RPOpc) { switch (RPOpc) { case X86::RETPOLINE_CALL32: return X86::CALLpcrel32; case X86::RETPOLINE_CALL64: return X86::CALL64pcrel32; case X86::RETPOLINE_TCRETURN32: return X86::TCRETURNdi; case X86::RETPOLINE_TCRETURN64: return X86::TCRETURNdi64; } llvm_unreachable("not retpoline opcode"); } static const char *getRetpolineSymbol(const X86Subtarget &Subtarget, unsigned Reg) { if (Subtarget.useRetpolineExternalThunk()) { // When using an external thunk for retpolines, we pick names that match the // names GCC happens to use as well. This helps simplify the implementation // of the thunks for kernels where they have no easy ability to create // aliases and are doing non-trivial configuration of the thunk's body. For // example, the Linux kernel will do boot-time hot patching of the thunk // bodies and cannot easily export aliases of these to loaded modules. // // Note that at any point in the future, we may need to change the semantics // of how we implement retpolines and at that time will likely change the // name of the called thunk. Essentially, there is no hard guarantee that // LLVM will generate calls to specific thunks, we merely make a best-effort // attempt to help out kernels and other systems where duplicating the // thunks is costly. switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__x86_indirect_thunk_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__x86_indirect_thunk_r11"; } llvm_unreachable("unexpected reg for retpoline"); } // When targeting an internal COMDAT thunk use an LLVM-specific name. 
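  // For example, a RETPOLINE_CALL64 of a callee held in a virtual register is
  // emitted below (in EmitLoweredRetpoline) roughly as:
  //
  //   movq  %callee_vreg, %r11
  //   callq __llvm_retpoline_r11        ; or __x86_indirect_thunk_r11 when
  //                                     ; external thunks are in use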
switch (Reg) { case X86::EAX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_eax"; case X86::ECX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_ecx"; case X86::EDX: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edx"; case X86::EDI: assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); return "__llvm_retpoline_edi"; case X86::R11: assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); return "__llvm_retpoline_r11"; } llvm_unreachable("unexpected reg for retpoline"); } MachineBasicBlock * X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI, MachineBasicBlock *BB) const { // Copy the virtual register into the R11 physical register and // call the retpoline thunk. DebugLoc DL = MI.getDebugLoc(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); unsigned CalleeVReg = MI.getOperand(0).getReg(); unsigned Opc = getOpcodeForRetpoline(MI.getOpcode()); // Find an available scratch register to hold the callee. On 64-bit, we can // just use R11, but we scan for uses anyway to ensure we don't generate // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't // already a register use operand to the call to hold the callee. If none // are available, use EDI instead. EDI is chosen because EBX is the PIC base // register and ESI is the base pointer to realigned stack frames with VLAs. SmallVector AvailableRegs; if (Subtarget.is64Bit()) AvailableRegs.push_back(X86::R11); else AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); // Zero out any registers that are already used. for (const auto &MO : MI.operands()) { if (MO.isReg() && MO.isUse()) for (unsigned &Reg : AvailableRegs) if (Reg == MO.getReg()) Reg = 0; } // Choose the first remaining non-zero available register. unsigned AvailableReg = 0; for (unsigned MaybeReg : AvailableRegs) { if (MaybeReg) { AvailableReg = MaybeReg; break; } } if (!AvailableReg) report_fatal_error("calling convention incompatible with retpoline, no " "available registers"); const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) .addReg(CalleeVReg); MI.getOperand(0).ChangeToES(Symbol); MI.setDesc(TII->get(Opc)); MachineInstrBuilder(*BB->getParent(), &MI) .addReg(AvailableReg, RegState::Implicit | RegState::Kill); return BB; } /// SetJmp implies future control flow change upon calling the corresponding /// LongJmp. /// Instead of using the 'return' instruction, the long jump fixes the stack and /// performs an indirect branch. To do so it uses the registers that were stored /// in the jump buffer (when calling SetJmp). /// In case the shadow stack is enabled we need to fix it as well, because some /// return addresses will be skipped. /// The function will save the SSP for future fixing in the function /// emitLongJmpShadowStackFix. /// \sa emitLongJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. /// \param [in] MBB The Machine Basic Block that will be modified. void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB; // Memory Reference. 
SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); // Initialize a register with zero. MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); unsigned ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Write the SSP register value to offset 3 in input memory buffer. unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc)); const int64_t SSPOffset = 3 * PVT.getStoreSize(); const unsigned MemOpndSlot = 1; for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } MIB.addReg(SSPCopyReg); MIB.setMemRefs(MMOs); } MachineBasicBlock * X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); unsigned DstReg; unsigned MemOpndSlot = 0; unsigned CurOp = 0; DstReg = MI.getOperand(CurOp++).getReg(); const TargetRegisterClass *RC = MRI.getRegClass(DstReg); assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!"); (void)TRI; unsigned mainDstReg = MRI.createVirtualRegister(RC); unsigned restoreDstReg = MRI.createVirtualRegister(RC); MemOpndSlot = CurOp; MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); // For v = setjmp(buf), we generate // // thisMBB: // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: // v_main = 0 // // sinkMBB: // v = phi(main, restore) // // restoreMBB: // if base pointer being used, load it from frame // v_restore = 1 MachineBasicBlock *thisMBB = MBB; MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); // thisMBB: unsigned PtrStoreOpc = 0; unsigned LabelReg = 0; const int64_t LabelOffset = 1 * PVT.getStoreSize(); bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); // Prepare IP either in reg or imm. if (!UseImmLabel) { PtrStoreOpc = (PVT == MVT::i64) ? 
X86::MOV64mr : X86::MOV32mr; const TargetRegisterClass *PtrRC = getRegClassFor(PVT); LabelReg = MRI.createVirtualRegister(PtrRC); if (Subtarget.is64Bit()) { MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) .addReg(X86::RIP) .addImm(0) .addReg(0) .addMBB(restoreMBB) .addReg(0); } else { const X86InstrInfo *XII = static_cast(TII); MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) .addReg(XII->getGlobalBaseReg(MF)) .addImm(0) .addReg(0) .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } } else PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; // Store IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); else MIB.add(MI.getOperand(MemOpndSlot + i)); } if (!UseImmLabel) MIB.addReg(LabelReg); else MIB.addMBB(restoreMBB); MIB.setMemRefs(MMOs); if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { emitSetJmpShadowStackFix(MI, thisMBB); } // Setup MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) .addMBB(restoreMBB); const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); MIB.addRegMask(RegInfo->getNoPreservedMask()); thisMBB->addSuccessor(mainMBB); thisMBB->addSuccessor(restoreMBB); // mainMBB: // EAX = 0 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); mainMBB->addSuccessor(sinkMBB); // sinkMBB: BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) .addReg(mainDstReg).addMBB(mainMBB) .addReg(restoreDstReg).addMBB(restoreMBB); // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *X86FI = MF->getInfo(); X86FI->setRestoreBasePointer(MF); unsigned FramePtr = RegInfo->getFrameRegister(*MF); unsigned BasePtr = RegInfo->getBaseRegister(); unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); restoreMBB->addSuccessor(sinkMBB); MI.eraseFromParent(); return sinkMBB; } /// Fix the shadow stack using the previously saved SSP pointer. /// \sa emitSetJmpShadowStackFix /// \param [in] MI The temporary Machine Instruction for the builtin. /// \param [in] MBB The Machine Basic Block that will be modified. /// \return The sink MBB that will perform the future indirect branch. 
MachineBasicBlock * X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); const TargetRegisterClass *PtrRC = getRegClassFor(PVT); // checkSspMBB: // xor vreg1, vreg1 // rdssp vreg1 // test vreg1, vreg1 // je sinkMBB # Jump if Shadow Stack is not supported // fallMBB: // mov buf+24/12(%rip), vreg2 // sub vreg1, vreg2 // jbe sinkMBB # No need to fix the Shadow Stack // fixShadowMBB: // shr 3/2, vreg2 // incssp vreg2 # fix the SSP according to the lower 8 bits // shr 8, vreg2 // je sinkMBB // fixShadowLoopPrepareMBB: // shl vreg2 // mov 128, vreg3 // fixShadowLoopMBB: // incssp vreg3 // dec vreg2 // jne fixShadowLoopMBB # Iterate until you finish fixing // # the Shadow Stack // sinkMBB: MachineFunction::iterator I = ++MBB->getIterator(); const BasicBlock *BB = MBB->getBasicBlock(); MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); MF->insert(I, checkSspMBB); MF->insert(I, fallMBB); MF->insert(I, fixShadowMBB); MF->insert(I, fixShadowLoopPrepareMBB); MF->insert(I, fixShadowLoopMBB); MF->insert(I, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI), MBB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(checkSspMBB); // Initialize a register with zero. unsigned ZReg = MRI.createVirtualRegister(PtrRC); unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; BuildMI(checkSspMBB, DL, TII->get(XorRROpc)) .addDef(ZReg) .addReg(ZReg, RegState::Undef) .addReg(ZReg, RegState::Undef); // Read the current SSP Register value to the zeroed register. unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC); unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); // Check whether the result of the SSP register is zero and jump directly // to the sink. unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr; BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) .addReg(SSPCopyReg) .addReg(SSPCopyReg); BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); checkSspMBB->addSuccessor(sinkMBB); checkSspMBB->addSuccessor(fallMBB); // Reload the previously saved SSP register value. unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; const int64_t SPPOffset = 3 * PVT.getStoreSize(); MachineInstrBuilder MIB = BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, SPPOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Subtract the current SSP from the previous SSP. 
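  // The delta is a byte count; INCSSP consumes a count of shadow-stack slots,
  // so it is shifted right by 3 (x86-64) or 2 (i386) below. INCSSP only honors
  // its low 8 bits, so any remaining slots are consumed by the fixShadowLoop
  // blocks in chunks of 128.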
unsigned SspSubReg = MRI.createVirtualRegister(PtrRC); unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) .addReg(PrevSSPReg) .addReg(SSPCopyReg); // Jump to sink in case PrevSSPReg <= SSPCopyReg. BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB); fallMBB->addSuccessor(sinkMBB); fallMBB->addSuccessor(fixShadowMBB); // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8. unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; unsigned Offset = (PVT == MVT::i64) ? 3 : 2; unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) .addReg(SspSubReg) .addImm(Offset); // Increase SSP when looking only on the lower 8 bits of the delta. unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD; BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); // Reset the lower 8 bits. unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) .addReg(SspFirstShrReg) .addImm(8); // Jump if the result of the shift is zero. BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB); fixShadowMBB->addSuccessor(sinkMBB); fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); // Do a single shift left. unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) .addReg(SspSecondShrReg); // Save the value 128 to a register (will be used next with incssp). unsigned Value128InReg = MRI.createVirtualRegister(PtrRC); unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) .addImm(128); fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB); // Since incssp only looks at the lower 8 bits, we might need to do several // iterations of incssp until we finish fixing the shadow stack. unsigned DecReg = MRI.createVirtualRegister(PtrRC); unsigned CounterReg = MRI.createVirtualRegister(PtrRC); BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) .addReg(SspAfterShlReg) .addMBB(fixShadowLoopPrepareMBB) .addReg(DecReg) .addMBB(fixShadowLoopMBB); // Every iteration we increase the SSP by 128. BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg); // Every iteration we decrement the counter by 1. unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r; BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); // Jump if the counter is not zero yet. BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB); fixShadowLoopMBB->addSuccessor(sinkMBB); fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); return sinkMBB; } MachineBasicBlock * X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); // Memory Reference SmallVector MMOs(MI.memoperands_begin(), MI.memoperands_end()); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); const TargetRegisterClass *RC = (PVT == MVT::i64) ? 
&X86::GR64RegClass : &X86::GR32RegClass; unsigned Tmp = MRI.createVirtualRegister(RC); // Since FP is only updated here but NOT referenced, it's treated as GPR. const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; unsigned SP = RegInfo->getStackRegister(); MachineInstrBuilder MIB; const int64_t LabelOffset = 1 * PVT.getStoreSize(); const int64_t SPOffset = 2 * PVT.getStoreSize(); unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; MachineBasicBlock *thisMBB = MBB; // When CET and shadow stack is enabled, we need to fix the Shadow Stack. if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { thisMBB = emitLongJmpShadowStackFix(MI, thisMBB); } // Reload FP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload IP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { const MachineOperand &MO = MI.getOperand(i); if (i == X86::AddrDisp) MIB.addDisp(MO, LabelOffset); else if (MO.isReg()) // Don't add the whole operand, we don't want to // preserve kill flags. MIB.addReg(MO.getReg()); else MIB.add(MO); } MIB.setMemRefs(MMOs); // Reload SP MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP); for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { if (i == X86::AddrDisp) MIB.addDisp(MI.getOperand(i), SPOffset); else MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's // the last instruction of the expansion. } MIB.setMemRefs(MMOs); // Jump BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); MI.eraseFromParent(); return thisMBB; } void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); MVT PVT = getPointerTy(MF->getDataLayout()); assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); unsigned Op = 0; unsigned VR = 0; bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && !isPositionIndependent(); if (UseImmLabel) { Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; } else { const TargetRegisterClass *TRC = (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; VR = MRI->createVirtualRegister(TRC); Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; if (Subtarget.is64Bit()) BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) .addReg(X86::RIP) .addImm(1) .addReg(0) .addMBB(DispatchBB) .addReg(0); else BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) .addReg(0) /* TII->getGlobalBaseReg(MF) */ .addImm(1) .addReg(0) .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) .addReg(0); } MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); addFrameReference(MIB, FI, Subtarget.is64Bit() ? 
56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else MIB.addReg(VR); } MachineBasicBlock * X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *BB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); const X86InstrInfo *TII = Subtarget.getInstrInfo(); int FI = MFI.getFunctionContextIndex(); // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (auto &MBB : *MF) { if (!MBB.isEHPad()) continue; MCSymbol *Sym = nullptr; for (const auto &MI : MBB) { if (MI.isDebugInstr()) continue; assert(MI.isEHLabel() && "expected EH_LABEL"); Sym = MI.getOperand(0).getMCSymbol(); break; } if (!MF->hasCallSiteLandingPad(Sym)) continue; for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { CallSiteNumToLPad[CSI].push_back(&MBB); MaxCSNum = std::max(MaxCSNum, CSI); } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { for (auto &LP : CallSiteNumToLPad[CSI]) { LPadList.push_back(LP); InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(true); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(X86::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert MBBs. MF->push_back(DispatchBB); MF->push_back(DispContBB); MF->push_back(TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information unsigned JTE = getJumpTableEncoding(); MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { const bool FPIs64Bit = Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); X86MachineFunctionInfo *MFI = MF->getInfo(); MFI->setRestoreBasePointer(MF); unsigned FP = RI.getFrameRegister(*MF); unsigned BP = RI.getBaseRegister(); unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, MFI->getRestoreBasePointerOffset()) .addRegMask(RI.getNoPreservedMask()); } else { BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) .addRegMask(RI.getNoPreservedMask()); } // IReg is used as an index in a memory operand and therefore can't be SP unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, Subtarget.is64Bit() ? 
8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); if (Subtarget.is64Bit()) { unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); // leaq .LJTI0_0(%rip), BReg BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) .addReg(X86::RIP) .addImm(1) .addReg(0) .addJumpTableIndex(MJTI) .addReg(0); // movzx IReg64, IReg BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) .addImm(0) .addReg(IReg) .addImm(X86::sub_32bit); switch (JTE) { case MachineJumpTableInfo::EK_BlockAddress: // jmpq *(BReg,IReg64,8) BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) .addReg(BReg) .addImm(8) .addReg(IReg64) .addImm(0) .addReg(0); break; case MachineJumpTableInfo::EK_LabelDifference32: { unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); // movl (BReg,IReg64,4), OReg BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) .addReg(BReg) .addImm(4) .addReg(IReg64) .addImm(0) .addReg(0); // movsx OReg64, OReg BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); // addq BReg, OReg64, TReg BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) .addReg(OReg64) .addReg(BReg); // jmpq *TReg BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); break; } default: llvm_unreachable("Unexpected jump table encoding"); } } else { // jmpl *.LJTI0_0(,IReg,4) BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) .addReg(0) .addImm(4) .addReg(IReg) .addJumpTableIndex(MJTI) .addReg(0); } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (auto &LP : LPadList) if (SeenMBBs.insert(LP).second) DispContBB->addSuccessor(LP); // N.B. the order the invoke BBs are processed in doesn't matter here. SmallVector MBBLPads; const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); for (MachineBasicBlock *MBB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. // Keep a copy of Successors since it's modified inside the loop. SmallVector Successors(MBB->succ_rbegin(), MBB->succ_rend()); // FIXME: Avoid quadratic complexity. for (auto MBBS : Successors) { if (MBBS->isEHPad()) { MBB->removeSuccessor(MBBS); MBBLPads.push_back(MBBS); } } MBB->addSuccessor(DispatchBB); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (auto &II : reverse(*MBB)) { if (!II.isCall()) continue; DenseMap DefRegs; for (auto &MOp : II.operands()) if (MOp.isReg()) DefRegs[MOp.getReg()] = true; MachineInstrBuilder MIB(*MF, &II); for (unsigned RI = 0; SavedRegs[RI]; ++RI) { unsigned Reg = SavedRegs[RI]; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (auto &LP : MBBLPads) LP->setIsEHPad(false); // The instruction is gone now. 
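  // Altogether the dispatch built above looks roughly like this (64-bit,
  // EK_BlockAddress jump table encoding):
  //
  //   DispatchBB:  movl <call-site slot of FI>, %ireg
  //                cmpl $<number of landing pads>, %ireg
  //                jae  TrapBB
  //   DispContBB:  leaq .LJTI0_0(%rip), %breg
  //                jmpq *(%breg, %ireg64, 8)
  //   TrapBB:      ud2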
MI.eraseFromParent(); return BB; } MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected instr type to insert"); case X86::TLS_addr32: case X86::TLS_addr64: case X86::TLS_base_addr32: case X86::TLS_base_addr64: return EmitLoweredTLSAddr(MI, BB); case X86::RETPOLINE_CALL32: case X86::RETPOLINE_CALL64: case X86::RETPOLINE_TCRETURN32: case X86::RETPOLINE_TCRETURN64: return EmitLoweredRetpoline(MI, BB); case X86::CATCHRET: return EmitLoweredCatchRet(MI, BB); case X86::CATCHPAD: return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); case X86::CMOV_FR32: case X86::CMOV_FR64: case X86::CMOV_GR8: case X86::CMOV_GR16: case X86::CMOV_GR32: case X86::CMOV_RFP32: case X86::CMOV_RFP64: case X86::CMOV_RFP80: case X86::CMOV_VR128: case X86::CMOV_VR128X: case X86::CMOV_VR256: case X86::CMOV_VR256X: case X86::CMOV_VR512: case X86::CMOV_VK2: case X86::CMOV_VK4: case X86::CMOV_VK8: case X86::CMOV_VK16: case X86::CMOV_VK32: case X86::CMOV_VK64: return EmitLoweredSelect(MI, BB); case X86::RDFLAGS32: case X86::RDFLAGS64: { unsigned PushF = MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64; unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r; MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF)); // Permit reads of the EFLAGS and DF registers without them being defined. // This intrinsic exists to read external processor state in flags, such as // the trap flag, interrupt flag, and direction flag, none of which are // modeled by the backend. assert(Push->getOperand(2).getReg() == X86::EFLAGS && "Unexpected register in operand!"); Push->getOperand(2).setIsUndef(); assert(Push->getOperand(3).getReg() == X86::DF && "Unexpected register in operand!"); Push->getOperand(3).setIsUndef(); BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg()); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::WRFLAGS32: case X86::WRFLAGS64: { unsigned Push = MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r; unsigned PopF = MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64; BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg()); BuildMI(*BB, MI, DL, TII->get(PopF)); MI.eraseFromParent(); // The pseudo is gone now. return BB; } case X86::RELEASE_FADD32mr: case X86::RELEASE_FADD64mr: return EmitLoweredAtomicFP(MI, BB); case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: case X86::FP64_TO_INT16_IN_MEM: case X86::FP64_TO_INT32_IN_MEM: case X86::FP64_TO_INT64_IN_MEM: case X86::FP80_TO_INT16_IN_MEM: case X86::FP80_TO_INT32_IN_MEM: case X86::FP80_TO_INT64_IN_MEM: { // Change the floating point control register to use "round towards zero" // mode when truncating to an integer value. int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); // Load the old value of the high byte of the control word... unsigned OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), CWFrameIdx); // Set the high part to be round to zero... 
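    // 0xC7F keeps all FP exceptions masked and sets the rounding-control field
    // (bits 11:10) to 11b, i.e. round toward zero, which gives the FIST family
    // C-style truncation semantics.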
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) .addImm(0xC7F); // Reload the modified control word now... addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); // Restore the memory image of control word to original value addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) .addReg(OldCW); // Get the X86 opcode to use. unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; } X86AddressMode AM = getAddressFromInstr(&MI, 0); addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); // Reload the original control word now. addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } // Thread synchronization. case X86::MONITOR: return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr); case X86::MONITORX: return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr); // Cache line zero case X86::CLZERO: return emitClzero(&MI, BB, Subtarget); // PKU feature case X86::WRPKRU: return emitWRPKRU(MI, BB, Subtarget); case X86::RDPKRU: return emitRDPKRU(MI, BB, Subtarget); // xbegin case X86::XBEGIN: return emitXBegin(MI, BB, Subtarget.getInstrInfo()); case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: case X86::EH_SjLj_SetJmp64: return emitEHSjLjSetJmp(MI, BB); case X86::EH_SjLj_LongJmp32: case X86::EH_SjLj_LongJmp64: return emitEHSjLjLongJmp(MI, BB); case X86::Int_eh_sjlj_setup_dispatch: return EmitSjLjDispatchBlock(MI, BB); case TargetOpcode::STATEPOINT: // As an implementation detail, STATEPOINT shares the STACKMAP format at // this point in the process. We diverge later. return emitPatchPoint(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); case TargetOpcode::PATCHABLE_EVENT_CALL: return emitXRayCustomEvent(MI, BB); case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: return emitXRayTypedEvent(MI, BB); case X86::LCMPXCHG8B: { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B // requires a memory operand. If it happens that current architecture is // i686 and for current function we need a base pointer // - which is ESI for i686 - register allocator would not be able to // allocate registers for an address in form of X(%reg, %reg, Y) // - there never would be enough unreserved registers during regalloc // (without the need for base ptr the only option would be X(%edi, %esi, Y). // We are giving a hand to register allocator by precomputing the address in // a new vreg using LEA. // If it is not i686 or there is no base pointer - nothing to do here. 
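    // In effect, the rewrite below turns
    //
    //   cmpxchg8b X(%reg1, %reg2, Y)
    //
    // into
    //
    //   leal      X(%reg1, %reg2, Y), %computed
    //   cmpxchg8b (%computed)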
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;

    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, something
    // has changed in the compiler's base pointer handling, which most
    // probably has to be addressed somehow here.
    assert(TRI->getBaseRegister() == X86::ESI &&
           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
           "base pointer in mind");

    MachineRegisterInfo &MRI = MF->getRegInfo();
    MVT SPTy = getPointerTy(MF->getDataLayout());
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use an index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;

    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
    // four operand definitions that are E[ABCD] registers. We skip them and
    // then insert the LEA.
    MachineBasicBlock::iterator MBBI(MI);
    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
    if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

bool X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                     const APInt &Demanded,
                                                     TargetLoweringOpt &TLO) const {
  // Only optimize Ands to prevent shrinking a constant that could be
  // matched by movzx.
  if (Op.getOpcode() != ISD::AND)
    return false;

  EVT VT = Op.getValueType();

  // Ignore vectors.
  if (VT.isVector())
    return false;

  unsigned Size = VT.getSizeInBits();

  // Make sure the RHS really is a constant.
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;

  const APInt &Mask = C->getAPIntValue();

  // Clear all non-demanded bits initially.
  APInt ShrunkMask = Mask & Demanded;

  // Find the width of the shrunk mask.
  unsigned Width = ShrunkMask.getActiveBits();

  // If the mask is all 0s there's nothing to do here.
  if (Width == 0)
    return false;

  // Find the next power of 2 width, rounding up to a byte.
  Width = PowerOf2Ceil(std::max(Width, 8U));
  // Truncate the width to size to handle illegal types.
  Width = std::min(Width, Size);

  // Calculate a possible zero extend mask for this constant.
  APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);

  // If we aren't changing the mask, just return true to keep it and prevent
  // the caller from optimizing.
  if (ZeroExtendMask == Mask)
    return true;

  // Make sure the new mask can be represented by a combination of mask bits
  // and non-demanded bits.
  if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
    return false;

  // Replace the constant with the zero extend mask.
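  // Worked example (sketch): for a 32-bit AND with Mask = 0x1FF where only the
  // low byte is demanded (Demanded = 0xFF):
  //   ShrunkMask = 0xFF, Width rounds to 8, ZeroExtendMask = 0xFF,
  //   and 0xFF is a subset of Mask | ~Demanded,
  // so the constant is rewritten to 0xFF and the AND can later be matched as a
  // movzx-style zero extension.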
SDLoc DL(Op); SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT); SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); return TLO.CombineTo(Op, NewOp); } void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); assert((Opc >= ISD::BUILTIN_OP_END || Opc == ISD::INTRINSIC_WO_CHAIN || Opc == ISD::INTRINSIC_W_CHAIN || Opc == ISD::INTRINSIC_VOID) && "Should use MaskedValueIsZero if you don't know whether Op" " is a target node!"); Known.resetAll(); switch (Opc) { default: break; case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; case X86ISD::MOVMSK: { unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); Known.Zero.setBitsFrom(NumLoBits); break; } case X86ISD::PEXTRB: case X86ISD::PEXTRW: { SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), Op.getConstantOperandVal(1)); Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); Known = Known.zextOrTrunc(BitWidth); Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); break; } case X86ISD::VSRAI: case X86ISD::VSHLI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) { Known.setAllZero(); break; } Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); unsigned ShAmt = ShiftImm->getZExtValue(); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { Known.Zero.ashrInPlace(ShAmt); Known.One.ashrInPlace(ShAmt); } } break; } case X86ISD::PACKUS: { // PACKUS is just a truncation if the upper half is zero. APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); Known.One = APInt::getAllOnesValue(BitWidth * 2); Known.Zero = APInt::getAllOnesValue(BitWidth * 2); KnownBits Known2; if (!!DemandedLHS) { Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } if (!!DemandedRHS) { Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } if (Known.countMinLeadingZeros() < BitWidth) Known.resetAll(); Known = Known.trunc(BitWidth); break; } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth+1); // If we don't know any bits, early out. if (Known.isUnknown()) break; KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth+1); // Only known if known in both the LHS and RHS. Known.One &= Known2.One; Known.Zero &= Known2.Zero; break; } } // Handle target shuffles. // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. 
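  // Sketch of the approach used below: every demanded lane of a target shuffle
  // reads some element of one of the shuffle's inputs, so the bits known for
  // the result are the intersection of the bits known for all of those source
  // elements (zeroed lanes only constrain the known-one bits). For example, if
  // every demanded lane reads elements whose upper 16 bits are known zero, the
  // shuffle result's upper 16 bits are known zero as well.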
if (isTargetShuffle(Opc)) { bool IsUnary; SmallVector Mask; SmallVector Ops; if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask, IsUnary)) { unsigned NumOps = Ops.size(); unsigned NumElts = VT.getVectorNumElements(); if (Mask.size() == NumElts) { SmallVector DemandedOps(NumOps, APInt(NumElts, 0)); Known.Zero.setAllBits(); Known.One.setAllBits(); for (unsigned i = 0; i != NumElts; ++i) { if (!DemandedElts[i]) continue; int M = Mask[i]; if (M == SM_SentinelUndef) { // For UNDEF elements, we don't know anything about the common state // of the shuffle result. Known.resetAll(); break; } else if (M == SM_SentinelZero) { Known.One.clearAllBits(); continue; } assert(0 <= M && (unsigned)M < (NumOps * NumElts) && "Shuffle index out of range"); unsigned OpIdx = (unsigned)M / NumElts; unsigned EltIdx = (unsigned)M % NumElts; if (Ops[OpIdx].getValueType() != VT) { // TODO - handle target shuffle ops with different value types. Known.resetAll(); break; } DemandedOps[OpIdx].setBit(EltIdx); } // Known bits are the values that are shared by every demanded element. for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) { if (!DemandedOps[i]) continue; KnownBits Known2 = DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1); Known.One &= Known2.One; Known.Zero &= Known2.Zero; } } } } } unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned VTBits = Op.getScalarValueSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { case X86ISD::SETCC_CARRY: // SETCC_CARRY sets the dest to ~0 for true or 0 for false. return VTBits; case X86ISD::VTRUNC: { // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); unsigned NumSrcBits = Src.getScalarValueSizeInBits(); assert(VTBits < NumSrcBits && "Illegal truncation input type"); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); if (Tmp > (NumSrcBits - VTBits)) return Tmp - (NumSrcBits - VTBits); return 1; } case X86ISD::PACKSS: { // PACKSS is just a truncation if the sign bits extend to the packed size. APInt DemandedLHS, DemandedRHS; getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS, DemandedRHS); unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); unsigned Tmp0 = SrcBits, Tmp1 = SrcBits; if (!!DemandedLHS) Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); if (!!DemandedRHS) Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); unsigned Tmp = std::min(Tmp0, Tmp1); if (Tmp > (SrcBits - VTBits)) return Tmp - (SrcBits - VTBits); return 1; } case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); APInt ShiftVal = cast(Op.getOperand(1))->getAPIntValue(); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); if (ShiftVal.uge(Tmp)) return 1; // Shifted all sign bits out --> unknown. return Tmp - ShiftVal.getZExtValue(); } case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt ShiftVal = cast(Op.getOperand(1))->getAPIntValue(); if (ShiftVal.uge(VTBits - 1)) return VTBits; // Sign splat. unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); ShiftVal += Tmp; return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); } case X86ISD::PCMPGT: case X86ISD::PCMPEQ: case X86ISD::CMPP: case X86ISD::VPCOM: case X86ISD::VPCOMU: // Vector compares return zero/all-bits result values. 
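    // e.g. each PCMPGTD lane is either 0x00000000 or 0xFFFFFFFF, so every bit
    // matches the lane's sign bit and the full element width can be reported.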
return VTBits; case X86ISD::CMOV: { unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); if (Tmp0 == 1) return 1; // Early out. unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); return std::min(Tmp0, Tmp1); } } // Fallback case. return 1; } SDValue X86TargetLowering::unwrapAddress(SDValue N) const { if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) return N->getOperand(0); return N; } // Attempt to match a combined shuffle mask against supported unary shuffle // instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) { unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); // Match against a VZEXT_MOVL vXi32 zero-extending instruction. if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) && isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction. // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { unsigned MaxScale = 64 / MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { bool Match = true; unsigned NumDstElts = NumMaskElts / Scale; for (unsigned i = 0; i != NumDstElts && Match; ++i) { Match &= isUndefOrEqual(Mask[i * Scale], (int)i); Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); } if (Match) { unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : MVT::getIntegerVT(MaskEltSize); SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); if (SrcVT.getVectorNumElements() == NumDstElts) Shuffle = unsigned(ISD::ZERO_EXTEND); else Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); return true; } } } // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) && isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } // Check if we have SSE3 which will let us use MOVDDUP etc. The // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. 
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v4f32; return true; } } if (MaskVT.is256BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v4f64; return true; } if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v8f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v8f32; return true; } } if (MaskVT.is512BitVector() && AllowFloatDomain) { assert(Subtarget.hasAVX512() && "AVX512 required for 512-bit vector shuffles"); if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v8f64; return true; } if (isTargetShuffleEquivalent( Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) { Shuffle = X86ISD::MOVSLDUP; SrcVT = DstVT = MVT::v16f32; return true; } if (isTargetShuffleEquivalent( Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) { Shuffle = X86ISD::MOVSHDUP; SrcVT = DstVT = MVT::v16f32; return true; } } // Attempt to match against broadcast-from-vector. if (Subtarget.hasAVX2()) { SmallVector BroadcastMask(NumMaskElts, 0); if (isTargetShuffleEquivalent(Mask, BroadcastMask)) { SrcVT = DstVT = MaskVT; Shuffle = X86ISD::VBROADCAST; return true; } } return false; } // Attempt to match a combined shuffle mask against supported unary immediate // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned InputSizeInBits = MaskVT.getSizeInBits(); unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); bool ContainsZeros = llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. if (!ContainsZeros && MaskScalarSizeInBits == 64) { // Check for lane crossing permutes. if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); PermuteImm = getV4X86ShuffleImm(Mask); return true; } if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { SmallVector RepeatedMask; if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { Shuffle = X86ISD::VPERMI; ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); PermuteImm = getV4X86ShuffleImm(RepeatedMask); return true; } } } else if (AllowFloatDomain && Subtarget.hasAVX()) { // VPERMILPD can permute with a non-repeating shuffle. 
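      // For example (sketch): a v4f64 mask {1, 0, 3, 2} contributes Mask[i] & 1
      // to bit i, giving PermuteImm = 0b0101, i.e. VPERMILPD swaps the two
      // doubles inside each 128-bit lane.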
Shuffle = X86ISD::VPERMILPI; ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); PermuteImm = 0; for (int i = 0, e = Mask.size(); i != e; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) continue; assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); PermuteImm |= (M & 1) << i; } return true; } } // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { // Narrow the repeated mask to create 32-bit element permutes. SmallVector WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) scaleShuffleMask(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); PermuteImm = getV4X86ShuffleImm(WordMask); return true; } } // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef LoMask(Mask.data() + 0, 4); ArrayRef HiMask(Mask.data() + 4, 4); // PSHUFLW: permute lower 4 elements only. if (isUndefOrInRange(LoMask, 0, 4) && isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { Shuffle = X86ISD::PSHUFLW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(LoMask); return true; } // PSHUFHW: permute upper 4 elements only. if (isUndefOrInRange(HiMask, 4, 8) && isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { // Offset the HiMask so that we can create the shuffle immediate. int OffsetHiMask[4]; for (int i = 0; i != 4; ++i) OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); Shuffle = X86ISD::PSHUFHW; ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } } } // Attempt to match against byte/bit shifts. // FIXME: Add 512-bit support. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, Zeroable, Subtarget); if (0 < ShiftAmt) { PermuteImm = (unsigned)ShiftAmt; return true; } } return false; } // Attempt to match a combined unary shuffle mask against supported binary // shuffle instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); if (MaskVT.is128BitVector()) { if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1); Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = Subtarget.hasSSE2() ? 
X86ISD::UNPCKH : X86ISD::MOVHLPS; SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; SrcVT = DstVT = MVT::v2f64; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; SrcVT = DstVT = MVT::v4f32; return true; } } // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, Subtarget)) { DstVT = MaskVT; return true; } } // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { SrcVT = DstVT = MaskVT; if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } return false; } static bool matchBinaryPermuteVectorShuffle( MVT MaskVT, ArrayRef Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { unsigned NumMaskElts = Mask.size(); unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); // Attempt to match against PALIGNR byte rotate. if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask); if (0 < ByteRotation) { Shuffle = X86ISD::PALIGNR; ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); PermuteImm = ByteRotation; return true; } } // Attempt to combine to X86ISD::BLENDI. if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { uint64_t BlendMask = 0; bool ForceV1Zero = false, ForceV2Zero = false; SmallVector TargetMask(Mask.begin(), Mask.end()); if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero, BlendMask)) { if (MaskVT == MVT::v16i16) { // We can only use v16i16 PBLENDW if the lanes are repeated. SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, RepeatedMask)) { assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); PermuteImm = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 8) PermuteImm |= 1 << i; V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::BLENDI; ShuffleVT = MaskVT; return true; } } else { // Determine a type compatible with X86ISD::BLENDI. 
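        // Sketch of the scaling done below: without AVX2 a v2i64 blend taking
        // element 1 from V2 (BlendMask = 0b10) is rewritten as a v8i16 PBLENDW,
        // and each mask bit is repeated once per 16-bit piece, so BlendMask
        // becomes 0b11110000.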
ShuffleVT = MaskVT; if (Subtarget.hasAVX2()) { if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v8i32; else if (ShuffleVT == MVT::v2i64) ShuffleVT = MVT::v4i32; } else { if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32) ShuffleVT = MVT::v8i16; else if (ShuffleVT == MVT::v4i64) ShuffleVT = MVT::v4f64; else if (ShuffleVT == MVT::v8i32) ShuffleVT = MVT::v8f32; } if (!ShuffleVT.isFloatingPoint()) { int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits(); BlendMask = scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale); ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale); ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale); } V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; PermuteImm = (unsigned)BlendMask; Shuffle = X86ISD::BLENDI; return true; } } } // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; ShuffleVT = MVT::v4f32; return true; } } // Attempt to combine to SHUFPD. if (AllowFloatDomain && EltSizeInBits == 64 && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) { Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); return true; } } // Attempt to combine to SHUFPS. if (AllowFloatDomain && EltSizeInBits == 32 && ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || (MaskVT.is256BitVector() && Subtarget.hasAVX()) || (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { SmallVector RepeatedMask; if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { // Match each half of the repeated mask, to determine if its just // referencing one of the vectors, is zeroable or entirely undef. auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { int M0 = RepeatedMask[Offset]; int M1 = RepeatedMask[Offset + 1]; if (isUndefInRange(RepeatedMask, Offset, 2)) { return DAG.getUNDEF(MaskVT); } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { S0 = (SM_SentinelUndef == M0 ? -1 : 0); S1 = (SM_SentinelUndef == M1 ? -1 : 1); return getZeroVector(MaskVT, Subtarget, DAG, DL); } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V1; } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); return V2; } return SDValue(); }; int ShufMask[4] = {-1, -1, -1, -1}; SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); if (Lo && Hi) { V1 = Lo; V2 = Hi; Shuffle = X86ISD::SHUFP; ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); PermuteImm = getV4X86ShuffleImm(ShufMask); return true; } } } return false; } /// Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// /// This is the leaf of the recursive combine below. 
When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); // Find the inputs that enter the chain. Note that multiple uses are OK // here, we're not going to remove the operands we find. bool UnaryShuffle = (Inputs.size() == 1); SDValue V1 = peekThroughBitcasts(Inputs[0]); SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) : peekThroughBitcasts(Inputs[1])); MVT VT1 = V1.getSimpleValueType(); MVT VT2 = V2.getSimpleValueType(); MVT RootVT = Root.getSimpleValueType(); assert(VT1.getSizeInBits() == RootVT.getSizeInBits() && VT2.getSizeInBits() == RootVT.getSizeInBits() && "Vector size mismatch"); SDLoc DL(Root); SDValue Res; unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); return DAG.getBitcast(RootVT, V1); } unsigned RootSizeInBits = RootVT.getSizeInBits(); unsigned NumRootElts = RootVT.getVectorNumElements(); unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || (RootVT.isFloatingPoint() && Depth >= 2) || (RootVT.is256BitVector() && !Subtarget.hasAVX2()); // Don't combine if we are a AVX512/EVEX target and the mask element size // is different from the root element size - this would prevent writemasks // from being reused. // TODO - this currently prevents all lane shuffles from occurring. // TODO - check for writemasks usage instead of always preventing combining. // TODO - attempt to narrow Mask back to writemask size. bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless // we need to use the zeroing feature. // TODO - this should support binary shuffles. if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0); PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4); Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res, DAG.getUNDEF(ShuffleVT), DAG.getConstant(PermMask, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // For masks that have been widened to 128-bit elements or more, // narrow back down to 64-bit elements. 
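  // e.g. for a 256-bit root whose BaseMask is {1, 0} in 128-bit elements (swap
  // the two halves), MaskScale is 2 and the narrowed 64-bit element mask is
  // {2, 3, 0, 1}.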
SmallVector Mask; if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; scaleShuffleMask(MaskScale, BaseMask, Mask); } else { Mask = SmallVector(BaseMask.begin(), BaseMask.end()); } unsigned NumMaskElts = Mask.size(); unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; // Determine the effective mask value type. FloatDomain &= (32 <= MaskEltSizeInBits); MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) : MVT::getIntegerVT(MaskEltSizeInBits); MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); // Only allow legal mask types. if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; unsigned Shuffle, PermuteImm; // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. APInt Zeroable(NumMaskElts, 0); for (unsigned i = 0; i != NumMaskElts; ++i) if (isUndefOrZero(Mask[i])) Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load // directly if we don't shuffle the lower element and we shuffle the upper // (zero) elements within themselves. if (V1.getOpcode() == X86ISD::VZEXT_LOAD && (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) { unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits; ArrayRef HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { return DAG.getBitcast(RootVT, V1); } } SDValue NewV1 = V1; // Save operand in case early exit happens. if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleSrcVT, NewV1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! Res = DAG.getBitcast(ShuffleVT, V1); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } SDValue NewV1 = V1; // Save operands in case early exit happens. SDValue NewV2 = V2; if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT, UnaryShuffle) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1); NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); return DAG.getBitcast(RootVT, Res); } NewV1 = V1; // Save operands in case early exit happens. 
NewV2 = V2; if (matchBinaryPermuteVectorShuffle( MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1, NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) && (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { if (Depth == 1 && Root.getOpcode() == Shuffle) return SDValue(); // Nothing to do! NewV1 = DAG.getBitcast(ShuffleVT, NewV1); NewV2 = DAG.getBitcast(ShuffleVT, NewV2); Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, DAG.getConstant(PermuteImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); // Annoyingly, SSE4A instructions don't map into the above match helpers. if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { uint64_t BitLen, BitIdx; if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); V2 = DAG.getBitcast(IntMaskVT, V2); Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) return SDValue(); // Depth threshold above which we can efficiently use variable mask shuffles. int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); return DAG.getBitcast(RootVT, Res); } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. 
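    // Sketch: for a v4i64 mask {0, SM_SentinelZero, 2, SM_SentinelZero} the
    // zero lanes are remapped to indices into the zero second source, giving
    // {0, 5, 2, 7} for VPERMV3(V1, mask, zero vector).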
if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { // Adjust shuffle mask - replace SM_SentinelZero with second source index. for (unsigned i = 0; i != NumMaskElts; ++i) if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); Res = DAG.getBitcast(MaskVT, V1); SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); return DAG.getBitcast(RootVT, Res); } // If we have a dual input lane-crossing shuffle then lower to VPERMV3. if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || (Subtarget.hasVLX() && (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasBWI() && MaskVT == MVT::v32i16) || (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); V1 = DAG.getBitcast(MaskVT, V1); V2 = DAG.getBitcast(MaskVT, V2); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); return DAG.getBitcast(RootVT, Res); } return SDValue(); } // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. if (UnaryShuffle && MaskContainsZeros && AllowVariableMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits); APInt UndefElts(NumMaskElts, 0); SmallVector EltBits(NumMaskElts, Zero); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } if (M == SM_SentinelZero) continue; EltBits[i] = AllOnes; } SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); Res = DAG.getBitcast(MaskVT, V1); unsigned AndOpcode = FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); return DAG.getBitcast(RootVT, Res); } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector VPermIdx; for (int M : Mask) { SDValue Idx = M < 0 ? 
DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); Res = DAG.getBitcast(MaskVT, V1); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); return DAG.getBitcast(RootVT, Res); } // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. if (AllowVariableMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. // Bits[3] - Match Bit. // Bits[2:1] - (Per Lane) PD Shuffle Mask. // Bits[2:0] - (Per Lane) PS Shuffle Mask. unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector VPerm2Idx; unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { VPerm2Idx.push_back(-1); continue; } if (M == SM_SentinelZero) { M2ZImm = 2; VPerm2Idx.push_back(8); continue; } int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); VPerm2Idx.push_back(Index); } V1 = DAG.getBitcast(MaskVT, V1); V2 = DAG.getBitcast(MaskVT, V2); SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable // mask, we can replace them with a single PSHUFB instruction profitably. // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { SmallVector PSHUFBMask; int NumBytes = RootVT.getSizeInBits() / 8; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); Res = DAG.getBitcast(ByteVT, V1); SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. 
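  // Sketch of the selector bytes built below: bits 4:0 of each control byte
  // index into the 32 bytes of the two concatenated sources, and an op field
  // of 4 in bits 7:5 (a control byte of 0x80, i.e. 128) forces that result
  // byte to zero, which is how SM_SentinelZero lanes are encoded.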
if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) SmallVector VPPERMMask; int NumBytes = 16; int Ratio = NumBytes / NumMaskElts; for (int i = 0; i < NumBytes; ++i) { int M = Mask[i / Ratio]; if (M == SM_SentinelUndef) { VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } if (M == SM_SentinelZero) { VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8)); continue; } M = Ratio * M + i % Ratio; VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::v16i8; V1 = DAG.getBitcast(ByteVT, V1); V2 = DAG.getBitcast(ByteVT, V2); SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. return SDValue(); } // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(ArrayRef Ops, ArrayRef Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Root.getSimpleValueType(); unsigned SizeInBits = VT.getSizeInBits(); unsigned NumMaskElts = Mask.size(); unsigned MaskSizeInBits = SizeInBits / NumMaskElts; unsigned NumOps = Ops.size(); // Extract constant bits from each source op. bool OneUseConstantOp = false; SmallVector UndefEltsOps(NumOps); SmallVector, 16> RawBitsOps(NumOps); for (unsigned i = 0; i != NumOps; ++i) { SDValue SrcOp = Ops[i]; OneUseConstantOp |= SrcOp.hasOneUse(); if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], RawBitsOps[i])) return SDValue(); } // Only fold if at least one of the constants is only used once or // the combined shuffle has included a variable mask shuffle, this // is to avoid constant pool bloat. if (!OneUseConstantOp && !HasVariableMask) return SDValue(); // Shuffle the constant bits according to the mask. APInt UndefElts(NumMaskElts, 0); APInt ZeroElts(NumMaskElts, 0); APInt ConstantElts(NumMaskElts, 0); SmallVector ConstantBitData(NumMaskElts, APInt::getNullValue(MaskSizeInBits)); for (unsigned i = 0; i != NumMaskElts; ++i) { int M = Mask[i]; if (M == SM_SentinelUndef) { UndefElts.setBit(i); continue; } else if (M == SM_SentinelZero) { ZeroElts.setBit(i); continue; } assert(0 <= M && M < (int)(NumMaskElts * NumOps)); unsigned SrcOpIdx = (unsigned)M / NumMaskElts; unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; if (SrcUndefElts[SrcMaskIdx]) { UndefElts.setBit(i); continue; } auto &SrcEltBits = RawBitsOps[SrcOpIdx]; APInt &Bits = SrcEltBits[SrcMaskIdx]; if (!Bits) { ZeroElts.setBit(i); continue; } ConstantElts.setBit(i); ConstantBitData[i] = Bits; } assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue()); // Create the constant data. MVT MaskSVT; if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); else MaskSVT = MVT::getIntegerVT(MaskSizeInBits); MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); return DAG.getBitcast(VT, CstOp); } /// Fully generic combining of x86 shuffle instructions. /// /// This should be the last combine run over the x86 shuffle instructions. 
Once /// they have been fully optimized, this will recursively consider all chains /// of single-use shuffle instructions, build a generic model of the cumulative /// shuffle operation, and check for simpler instructions which implement this /// operation. We use this primarily for two purposes: /// /// 1) Collapse generic shuffles to specialized single instructions when /// equivalent. In most cases, this is just an encoding size win, but /// sometimes we will collapse multiple generic shuffles into a single /// special-purpose shuffle. /// 2) Look for sequences of shuffle instructions with 3 or more total /// instructions, and replace them with the slightly more expensive SSSE3 /// PSHUFB instruction if available. We do this as the last combining step /// to ensure we avoid using PSHUFB if we can implement the shuffle with /// a suitable short sequence of other instructions. The PSHUFB will either /// use a register or have to read from memory and so is slightly (but only /// slightly) more expensive than the other shuffle instructions. /// /// Because this is inherently a quadratic operation (for each shuffle in /// a chain, we recurse up the chain), the depth is limited to 8 instructions. /// This should never be an issue in practice as the shuffle lowering doesn't /// produce sequences of more than 8 instructions. /// /// FIXME: We will currently miss some cases where the redundant shuffling /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth, bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. const unsigned MaxRecursionDepth = 8; if (Depth > MaxRecursionDepth) return SDValue(); // Directly rip through bitcasts to find the underlying operand. SDValue Op = SrcOps[SrcOpIndex]; Op = peekThroughOneUseBitcasts(Op); MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) return SDValue(); // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); // Extract target shuffle mask and resolve sentinels and inputs. SmallVector OpMask; SmallVector OpInputs; if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) return SDValue(); // TODO - Add support for more than 2 inputs. if (2 < OpInputs.size()) return SDValue(); SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue()); // Add the inputs to the Ops list, avoiding duplicates. SmallVector Ops(SrcOps.begin(), SrcOps.end()); auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int { if (!Input) return -1; // Attempt to find an existing match. SDValue InputBC = peekThroughBitcasts(Input); for (int i = 0, e = Ops.size(); i < e; ++i) if (InputBC == peekThroughBitcasts(Ops[i])) return i; // Match failed - should we replace an existing Op? if (InsertionPoint >= 0) { Ops[InsertionPoint] = Input; return InsertionPoint; } // Add to the end of the Ops list. 
Ops.push_back(Input); return Ops.size() - 1; }; int InputIdx0 = AddOp(Input0, SrcOpIndex); int InputIdx1 = AddOp(Input1, -1); assert(((RootMask.size() > OpMask.size() && RootMask.size() % OpMask.size() == 0) || (OpMask.size() > RootMask.size() && OpMask.size() % RootMask.size() == 0) || OpMask.size() == RootMask.size()) && "The smaller number of elements must divide the larger."); // This function can be performance-critical, so we rely on the power-of-2 // knowledge that we have about the mask sizes to replace div/rem ops with // bit-masks and shifts. assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes"); unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size()); unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size()); unsigned MaskWidth = std::max(OpMask.size(), RootMask.size()); unsigned RootRatio = std::max(1, OpMask.size() >> RootMaskSizeLog2); unsigned OpRatio = std::max(1, RootMask.size() >> OpMaskSizeLog2); assert((RootRatio == 1 || OpRatio == 1) && "Must not have a ratio for both incoming and op masks!"); assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes"); assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes"); unsigned RootRatioLog2 = countTrailingZeros(RootRatio); unsigned OpRatioLog2 = countTrailingZeros(OpRatio); SmallVector Mask(MaskWidth, SM_SentinelUndef); // Merge this shuffle operation's mask into our accumulated mask. Note that // this shuffle's mask will be the first applied to the input, followed by the // root mask to get us all the way to the root value arrangement. The reason // for this order is that we are recursing up the operation chain. for (unsigned i = 0; i < MaskWidth; ++i) { unsigned RootIdx = i >> RootRatioLog2; if (RootMask[RootIdx] < 0) { // This is a zero or undef lane, we're done. Mask[i] = RootMask[RootIdx]; continue; } unsigned RootMaskedIdx = RootRatio == 1 ? RootMask[RootIdx] : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1)); // Just insert the scaled root mask value if it references an input other // than the SrcOp we're currently inserting. if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) || (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) { Mask[i] = RootMaskedIdx; continue; } RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1); unsigned OpIdx = RootMaskedIdx >> OpRatioLog2; if (OpMask[OpIdx] < 0) { // The incoming lanes are zero or undef, it doesn't matter which ones we // are using. Mask[i] = OpMask[OpIdx]; continue; } // Ok, we have non-zero lanes, map them through to one of the Op's inputs. unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx] : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1)); OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1); if (OpMask[OpIdx] < (int)OpMask.size()) { assert(0 <= InputIdx0 && "Unknown target shuffle input"); OpMaskedIdx += InputIdx0 * MaskWidth; } else { assert(0 <= InputIdx1 && "Unknown target shuffle input"); OpMaskedIdx += InputIdx1 * MaskWidth; } Mask[i] = OpMaskedIdx; } // Handle the all undef/zero cases early. if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) return DAG.getUNDEF(Root.getValueType()); // TODO - should we handle the mixed zero/undef case as well? Just returning // a zero mask will lose information on undef elements possibly reducing // future combine possibilities. 
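  // Net effect of the mask merge above, as a small worked example for a unary
  // op at Ops[0]: with RootMask = {0, 2, 1, 3} over four 32-bit lanes and
  // OpMask = {1, 0} over two 64-bit lanes, OpRatio is 2 and the accumulated
  // mask becomes {2, 0, 3, 1} - the op's half swap applied first, then the
  // root's dword shuffle on top of it.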
if (all_of(Mask, [](int Idx) { return Idx < 0; })) return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, SDLoc(Root)); // Remove unused shuffle source ops. resolveTargetShuffleInputsAndMask(Ops, Mask); assert(!Ops.empty() && "Shuffle with no inputs detected"); HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode()); // Update the list of shuffle nodes that have been combined so far. SmallVector CombinedNodes(SrcNodes.begin(), SrcNodes.end()); CombinedNodes.push_back(Op.getNode()); // See if we can recurse into each shuffle source op (if it's a target // shuffle). The source op should only be generally combined if it either has // a single use (i.e. current Op) or all its users have already been combined, // if not then we can still combine but should prevent generation of variable // shuffles to avoid constant pool bloat. // Don't recurse if we already have more source ops than we can combine in // the remaining recursion depth. if (Ops.size() < (MaxRecursionDepth - Depth)) { for (int i = 0, e = Ops.size(); i < e; ++i) { bool AllowVar = false; if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) AllowVar = AllowVariableMask; if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, AllowVar, DAG, Subtarget)) return Res; } } // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( Ops, Mask, Root, HasVariableMask, DAG, Subtarget)) return Cst; // We can only combine unary and binary shuffle mask cases. if (Ops.size() > 2) return SDValue(); // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. SmallVector WidenedMask; while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) { Mask = std::move(WidenedMask); } // Canonicalization of binary shuffle masks to improve pattern matching by // commuting the inputs. if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) { ShuffleVectorSDNode::commuteMask(Mask); std::swap(Ops[0], Ops[1]); } // Finally, try to combine into a single shuffle instruction. return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, AllowVariableMask, DAG, Subtarget); } /// Get the PSHUF-style mask from PSHUF node. /// /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4 /// PSHUF-style masks that can be reused with such instructions. static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; SmallVector Ops; bool IsUnary; bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); // If we have more than 128-bits, only the low 128-bits of shuffle mask // matter. Check that the upper masks are repeats and remove them. 
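  // e.g. a 256-bit PSHUFD mask {1, 0, 3, 2, 5, 4, 7, 6} repeats the same
  // pattern in both 128-bit lanes, so only the low-lane mask {1, 0, 3, 2} is
  // kept.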
if (VT.getSizeInBits() > 128) { int LaneElts = 128 / VT.getScalarSizeInBits(); #ifndef NDEBUG for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i) for (int j = 0; j < LaneElts; ++j) assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) && "Mask doesn't repeat in high 128-bit lanes!"); #endif Mask.resize(LaneElts); } switch (N.getOpcode()) { case X86ISD::PSHUFD: return Mask; case X86ISD::PSHUFLW: Mask.resize(4); return Mask; case X86ISD::PSHUFHW: Mask.erase(Mask.begin(), Mask.begin() + 4); for (int &M : Mask) M -= 4; return Mask; default: llvm_unreachable("No valid shuffle instruction found!"); } } /// Search for a combinable shuffle across a chain ending in pshufd. /// /// We walk up the chain and look for a combinable shuffle, skipping over /// shuffles that we could hoist this shuffle's transformation past without /// altering anything. static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef Mask, SelectionDAG &DAG) { assert(N.getOpcode() == X86ISD::PSHUFD && "Called with something other than an x86 128-bit half shuffle!"); SDLoc DL(N); // Walk up a single-use chain looking for a combinable shuffle. Keep a stack // of the shuffles in the chain so that we can form a fresh chain to replace // this one. SmallVector Chain; SDValue V = N.getOperand(0); for (; V.hasOneUse(); V = V.getOperand(0)) { switch (V.getOpcode()) { default: return SDValue(); // Nothing combined! case ISD::BITCAST: // Skip bitcasts as we always know the type for the target specific // instructions. continue; case X86ISD::PSHUFD: // Found another dword shuffle. break; case X86ISD::PSHUFLW: // Check that the low words (being shuffled) are the identity in the // dword shuffle, and the high words are self-contained. if (Mask[0] != 0 || Mask[1] != 1 || !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4)) return SDValue(); Chain.push_back(V); continue; case X86ISD::PSHUFHW: // Check that the high words (being shuffled) are the identity in the // dword shuffle, and the low words are self-contained. if (Mask[2] != 2 || Mask[3] != 3 || !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2)) return SDValue(); Chain.push_back(V); continue; case X86ISD::UNPCKL: case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. unsigned CombineOp = V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW; if (V.getOperand(0) != V.getOperand(1) || !V->isOnlyUserOf(V.getOperand(0).getNode())) return SDValue(); Chain.push_back(V); V = V.getOperand(0); do { switch (V.getOpcode()) { default: return SDValue(); // Nothing to combine. case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: if (V.getOpcode() == CombineOp) break; Chain.push_back(V); LLVM_FALLTHROUGH; case ISD::BITCAST: V = V.getOperand(0); continue; } break; } while (V.hasOneUse()); break; } // Break out of the loop if we break out of the switch. break; } if (!V.hasOneUse()) // We fell out of the loop without finding a viable combining instruction. return SDValue(); // Merge this node's mask and our incoming mask. SmallVector VMask = getPSHUFShuffleMask(V); for (int &M : Mask) M = VMask[M]; V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0), getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); // Rebuild the chain around this new shuffle. 
while (!Chain.empty()) { SDValue W = Chain.pop_back_val(); if (V.getValueType() != W.getOperand(0).getValueType()) V = DAG.getBitcast(W.getOperand(0).getValueType(), V); switch (W.getOpcode()) { default: llvm_unreachable("Only PSHUF and UNPCK instructions get here!"); case X86ISD::UNPCKL: case X86ISD::UNPCKH: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V); break; case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1)); break; } } if (V.getValueType() != N.getValueType()) V = DAG.getBitcast(N.getValueType(), V); // Return the new chain to replace N. return V; } /// Try to combine x86 target specific shuffles. static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector Mask; unsigned Opcode = N.getOpcode(); // Combine binary shuffle of 2 similar 'Horizontal' instructions into a // single instruction. if (VT.getScalarSizeInBits() == 64 && (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || Opcode == X86ISD::UNPCKL)) { auto BC0 = peekThroughBitcasts(N.getOperand(0)); auto BC1 = peekThroughBitcasts(N.getOperand(1)); EVT VT0 = BC0.getValueType(); EVT VT1 = BC1.getValueType(); unsigned Opcode0 = BC0.getOpcode(); unsigned Opcode1 = BC1.getOpcode(); if (Opcode0 == Opcode1 && VT0 == VT1 && (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { SDValue Lo, Hi; if (Opcode == X86ISD::MOVSD) { Lo = BC1.getOperand(0); Hi = BC0.getOperand(1); } else { Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); } SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); return DAG.getBitcast(VT, Horiz); } } switch (Opcode) { case X86ISD::VBROADCAST: { // If broadcasting from another shuffle, attempt to simplify it. // TODO - we really need a general SimplifyDemandedVectorElts mechanism. SDValue Src = N.getOperand(0); SDValue BC = peekThroughBitcasts(Src); EVT SrcVT = Src.getValueType(); EVT BCVT = BC.getValueType(); if (isTargetShuffle(BC.getOpcode()) && VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); SmallVector DemandedMask(BCVT.getVectorNumElements(), SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } return SDValue(); } case X86ISD::PSHUFD: case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; case X86ISD::MOVSD: case X86ISD::MOVSS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); // Canonicalize scalar FPOps: // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0]))) // If commutable, allow OP(N1[0], N0[0]). 
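// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] A scalar model of the horizontal-op merge above for the v2f64
// (HADDPD) case: with X = hadd(a, b) and Y = hadd(c, d), unpcklpd(X, Y) is
// {a0+a1, c0+c1}, i.e. hadd(a, c), so a shuffle of two horizontal adds
// collapses into a single horizontal add of reshuffled operands.
#include <array>

using V2F64 = std::array<double, 2>;

static V2F64 haddpd(const V2F64 &A, const V2F64 &B) {
  return {A[0] + A[1], B[0] + B[1]};
}
static V2F64 unpcklpd(const V2F64 &X, const V2F64 &Y) { return {X[0], Y[0]}; }
// unpcklpd(haddpd(a, b), haddpd(c, d)) == haddpd(a, c), element for element.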
unsigned Opcode1 = N1.getOpcode(); if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB || Opcode1 == ISD::FDIV) { SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); if (N10 == N0 || (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) { if (N10 != N0) std::swap(N10, N11); MVT SVT = VT.getVectorElementType(); SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx); N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx); SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11); SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); return DAG.getNode(Opcode, DL, VT, N0, SclVec); } } return SDValue(); } case X86ISD::INSERTPS: { assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; // If we zero out all elements from Op0 then we don't need to reference it. if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); // If we zero out the element from Op1 then we don't need to reference it. if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; SmallVector Ops1; if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. InsertPSMask |= (1u << DstIdx); return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; SmallVector Ops0; if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; continue; } // The input vector element must be inline. if (M != i && M != (i + 4)) return SDValue(); // Determine which inputs of the target shuffle we're using. UseInput00 |= (0 <= M && M < 4); UseInput01 |= (4 <= M); } // If we're not using both inputs of the target shuffle then use the // referenced input directly. 
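// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The INSERTPS immediate decoded above packs three fields; the field
// layout follows the instruction's documented encoding.
#include <cstdint>

struct InsertPSImm {
  unsigned SrcIdx;   // bits [7:6]: lane read from the second source
  unsigned DstIdx;   // bits [5:4]: destination lane that is overwritten
  unsigned ZeroMask; // bits [3:0]: result lanes forced to zero
};

static InsertPSImm decodeInsertPSImm(uint8_t Imm) {
  return {(Imm >> 6) & 0x3u, (Imm >> 4) & 0x3u, Imm & 0xFu};
}
// When (ZeroMask | (1u << DstIdx)) == 0xF, every lane of the first operand is
// either zeroed or replaced, which is why the code above can drop Op0 entirely.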
if (UseInput00 && !UseInput01) { Updated = true; Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; Op0 = Ops0[1]; } if (Updated) return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); return SDValue(); } default: return SDValue(); } // Nuke no-op shuffles that show up after combining. if (isNoopShuffleMask(Mask)) return N.getOperand(0); // Look for simplifications involving one or two shuffle instructions. SDValue V = N.getOperand(0); switch (N.getOpcode()) { default: break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); // See if this reduces to a PSHUFD which is no more expensive and can // combine with more operations. Note that it has to at least flip the // dwords as otherwise it would have been removed as a no-op. if (makeArrayRef(Mask).equals({2, 3, 0, 1})) { int DMask[] = {0, 1, 2, 3}; int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; DMask[DOffset + 0] = DOffset + 1; DMask[DOffset + 1] = DOffset + 0; MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); V = DAG.getBitcast(DVT, V); V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); return DAG.getBitcast(VT, V); } // Look for shuffle patterns which can be implemented as a single unpack. // FIXME: This doesn't handle the location of the PSHUFD generically, and // only works when we have a PSHUFD followed by two half-shuffles. if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && (V.getOpcode() == X86ISD::PSHUFLW || V.getOpcode() == X86ISD::PSHUFHW) && V.getOpcode() != N.getOpcode() && V.hasOneUse()) { SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) { SmallVector VMask = getPSHUFShuffleMask(V); SmallVector DMask = getPSHUFShuffleMask(D); int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; int WordMask[8]; for (int i = 0; i < 4; ++i) { WordMask[i + NOffset] = Mask[i] + NOffset; WordMask[i + VOffset] = VMask[i] + VOffset; } // Map the word mask through the DWord mask. int MappedMask[8]; for (int i = 0; i < 8; ++i) MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { // We can replace all three shuffles with an unpack. V = DAG.getBitcast(VT, D.getOperand(0)); return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, VT, V, V); } } } break; case X86ISD::PSHUFD: if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) return NewN; break; } return SDValue(); } /// Checks if the shuffle mask takes subsequent elements /// alternately from two vectors. /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct. static bool isAddSubOrSubAddMask(ArrayRef Mask, bool &Op0Even) { int ParitySrc[2] = {-1, -1}; unsigned Size = Mask.size(); for (unsigned i = 0; i != Size; ++i) { int M = Mask[i]; if (M < 0) continue; // Make sure we are using the matching element from the input. if ((M % Size) != i) return false; // Make sure we use the same input for all elements of the same parity. int Src = M / Size; if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src) return false; ParitySrc[i % 2] = Src; } // Make sure each input is used. 
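// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The "map the word mask through the DWord mask" step above, in
// standalone form: word lane i of the combined shuffle reads word
// 2*DMask[WordMask[i]/2] + WordMask[i]%2 of the original vector. If the result
// is {0,0,1,1,2,2,3,3} or {4,4,5,5,6,6,7,7}, the three shuffles act like a
// single UNPCKL/UNPCKH of the vector with itself.
#include <array>

static std::array<int, 8>
mapWordMaskThroughDWordMask(const std::array<int, 8> &WordMask,
                            const std::array<int, 4> &DMask) {
  std::array<int, 8> Mapped{};
  for (int i = 0; i < 8; ++i)
    Mapped[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
  return Mapped;
}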
if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1]) return false; Op0Even = ParitySrc[0] == 0; return true; } /// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD) /// operation. If true is returned then the operands of ADDSUB(SUBADD) operation /// are written to the parameters \p Opnd0 and \p Opnd1. /// /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes /// so it is easier to generically match. We also insert dummy vector shuffle /// nodes for the operands which explicitly discard the lanes which are unused /// by this operation to try to flow through the rest of the combiner /// the fact that they're unused. static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, bool &IsSubAdd) { EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) || !VT.getSimpleVT().isFloatingPoint()) return false; // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return false; SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); // Make sure we have an FADD and an FSUB. if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) || (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) || V1.getOpcode() == V2.getOpcode()) return false; // If there are other uses of these operations we can't fold them. if (!V1->hasOneUse() || !V2->hasOneUse()) return false; // Ensure that both operations have the same operands. Note that we can // commute the FADD operands. SDValue LHS, RHS; if (V1.getOpcode() == ISD::FSUB) { LHS = V1->getOperand(0); RHS = V1->getOperand(1); if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) return false; } else { assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode"); LHS = V2->getOperand(0); RHS = V2->getOperand(1); if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) && (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS)) return false; } ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return false; // It's a subadd if the vector in the even parity is an FADD. IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD : V2->getOpcode() == ISD::FADD; Opnd0 = LHS; Opnd1 = RHS; return true; } /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd. static SDValue combineShuffleToFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { // We only handle target-independent shuffles. // FIXME: It would be easy and harmless to use the target shuffle mask // extraction tool to support more. if (N->getOpcode() != ISD::VECTOR_SHUFFLE) return SDValue(); MVT VT = N->getSimpleValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT)) return SDValue(); // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c). 
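// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] A standalone version of the mask test completed just above
// (isAddSubOrSubAddMask): every defined lane i must select element i of some
// source, all even lanes must agree on one source and all odd lanes on the
// other, and both sources must actually be used.
#include <vector>

static bool isAlternatingParityMask(const std::vector<int> &Mask,
                                    bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  const int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) // undef lanes are ignored
      continue;
    if (M % Size != i) // must be the matching lane of its source
      return false;
    int Src = M / Size; // 0 = first operand, 1 = second operand
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}
// {0,5,2,7} and {8,1,10,3,12,5,14,7} both pass, as in the comment above.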
SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue FMAdd = Op0, FMSub = Op1; if (FMSub.getOpcode() != X86ISD::FMSUB) std::swap(FMAdd, FMSub); if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB || FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() || FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() || FMAdd.getOperand(2) != FMSub.getOperand(2)) return SDValue(); // Check for correct shuffle mask. ArrayRef Mask = cast(N)->getMask(); bool Op0Even; if (!isAddSubOrSubAddMask(Mask, Op0Even)) return SDValue(); // FMAddSub takes zeroth operand from FMSub node. SDLoc DL(N); bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd; unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1), FMAdd.getOperand(2)); } /// Try to combine a shuffle into a target-specific add-sub or /// mul-add-sub node. static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const X86Subtarget &Subtarget, SelectionDAG &DAG) { if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG)) return V; SDValue Opnd0, Opnd1; bool IsSubAdd; if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) return SDValue(); MVT VT = N->getSimpleValueType(0); SDLoc DL(N); // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); } if (IsSubAdd) return SDValue(); // Do not generate X86ISD::ADDSUB node for 512-bit types even though // the ADDSUB idiom has been successfully recognized. There are no known // X86 targets with 512-bit ADDSUB instructions! if (VT.is512BitVector()) return SDValue(); return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } // We are looking for a shuffle where both sources are concatenated with undef // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so // if we can express this as a single-source shuffle, that's preferable. static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX2() || !isa(N)) return SDValue(); EVT VT = N->getValueType(0); // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (VT.getVectorElementType() != MVT::i32 && VT.getVectorElementType() != MVT::i64 && VT.getVectorElementType() != MVT::f32 && VT.getVectorElementType() != MVT::f64) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Check that both sources are concats with undef. if (N0.getOpcode() != ISD::CONCAT_VECTORS || N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef()) return SDValue(); // Construct the new shuffle mask. Elements from the first source retain their // index, but elements from the second source no longer need to skip an undef. SmallVector Mask; int NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (int Elt : SVOp->getMask()) Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); SDLoc DL(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } /// Eliminate a redundant shuffle of a horizontal math op. 
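// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The mask rewrite used by combineShuffleOfConcatUndef() above: lanes
// taken from (concat t2, undef) must skip t2's undef upper half once the
// sources become the single (concat t1, t2) vector.
#include <vector>

static std::vector<int> foldConcatUndefShuffleMask(std::vector<int> Mask,
                                                   int NumElts) {
  for (int &Elt : Mask)
    if (Elt >= NumElts)   // lane came from the second concat
      Elt -= NumElts / 2; // now indexes the upper half of (concat t1, t2)
  return Mask;
}
// With NumElts = 8: {0,1,2,3,8,9,10,11} becomes {0,1,2,3,4,5,6,7}.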
static SDValue foldShuffleOfHorizOp(SDNode *N) { if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) return SDValue(); SDValue HOp = N->getOperand(0); if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) return SDValue(); // 128-bit horizontal math instructions are defined to operate on adjacent // lanes of each operand as: // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] // ...similarly for v2f64 and v8i16. // TODO: Handle UNDEF operands. if (HOp.getOperand(0) != HOp.getOperand(1)) return SDValue(); // When the operands of a horizontal math op are identical, the low half of // the result is the same as the high half. If the shuffle is also replicating // low and high halves, we don't need the shuffle. // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X ArrayRef Mask = cast(N)->getMask(); // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, // but this should be tied to whatever horizontal op matching and shuffle // canonicalization are producing. if (HOp.getValueSizeInBits() == 128 && (isTargetShuffleEquivalent(Mask, {0, 0}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) || isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}))) return HOp; if (HOp.getValueSizeInBits() == 256 && (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}) || isTargetShuffleEquivalent(Mask, {0, 1, 0, 1, 4, 5, 4, 5}) || isTargetShuffleEquivalent( Mask, {0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11}))) return HOp; return SDValue(); } static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; if (SDValue HAddSub = foldShuffleOfHorizOp(N)) return HAddSub; } // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // // This code performs the following transformation: // fold: (shuffle (bitcast (BINOP A, B)), Undef, ) -> // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, ) // // We do this only if both the bitcast and the BINOP dag nodes have // one use. Also, perform this transformation only if the new binary // operation is legal. This is to avoid introducing dag nodes that // potentially need to be further expanded (or custom lowered) into a // less optimal sequence of dag nodes. if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue BC0 = N0.getOperand(0); EVT SVT = BC0.getValueType(); unsigned Opcode = BC0.getOpcode(); unsigned NumElts = VT.getVectorNumElements(); if (BC0.hasOneUse() && SVT.isVector() && SVT.getVectorNumElements() * 2 == NumElts && TLI.isOperationLegal(Opcode, VT)) { bool CanFold = false; switch (Opcode) { default : break; case ISD::ADD: case ISD::SUB: case ISD::MUL: // isOperationLegal lies for integer ops on floating point types. 
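// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] A scalar model of why the shuffle above is redundant: a 128-bit
// horizontal add of an operand with itself produces identical low and high
// halves, so a shuffle that merely replicates halves changes nothing.
#include <array>

static std::array<float, 4> haddps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// haddps(A, A) == {A0+A1, A2+A3, A0+A1, A2+A3}: lane 0 == lane 2 and
// lane 1 == lane 3, matching the lane layout described in the comment above.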
CanFold = VT.isInteger(); break; case ISD::FADD: case ISD::FSUB: case ISD::FMUL: // isOperationLegal lies for floating point ops on integer types. CanFold = VT.isFloatingPoint(); break; } unsigned SVTNumElts = SVT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(N); for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) == (int)(i * 2); for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i) CanFold = SVOp->getMaskElt(i) < 0; if (CanFold) { SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0)); SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1)); SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01); return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask()); } } } // Combine a vector_shuffle that is equal to build_vector load1, load2, load3, // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are // consecutive, non-overlapping, and in the right order. SmallVector Elts; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) { Elts.push_back(Elt); continue; } Elts.clear(); break; } if (Elts.size() == VT.getVectorNumElements()) if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true)) return LD; // For AVX2, we sometimes want to combine // (vector_shuffle (concat_vectors t1, undef) // (concat_vectors t2, undef)) // Into: // (vector_shuffle (concat_vectors t1, t2), undef) // Since the latter can be efficiently lowered with VPERMD/VPERMQ if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) return ShufConcat; if (isTargetShuffle(N->getOpcode())) { SDValue Op(N, 0); if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) return Shuffle; // Try recursively combining arbitrary sequences of x86 shuffle // instructions into higher-order shuffles. We do this after combining // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; // Simplify source operands based on shuffle mask. // TODO - merge this into combineX86ShufflesRecursively. APInt KnownUndef, KnownZero; APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); } // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the // operands is an extend from v2i32 to v2i64. Turn it into a pmulld. // FIXME: This can probably go away once we default to widening legalization. 
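// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The mask shape accepted by the fold just above: the first half of
// the shuffle mask must select the even elements in order (0, 2, 4, ...) and
// every remaining lane must be undef (negative).
#include <vector>

static bool isEvenExtractThenUndefMask(const std::vector<int> &Mask,
                                       size_t HalfElts) {
  for (size_t i = 0; i < Mask.size(); ++i)
    if (i < HalfElts ? Mask[i] != (int)(i * 2) : Mask[i] >= 0)
      return false;
  return true;
}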
if (Subtarget.hasSSE41() && VT == MVT::v4i32 && N->getOpcode() == ISD::VECTOR_SHUFFLE && N->getOperand(0).getOpcode() == ISD::BITCAST && N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) { SDValue BC = N->getOperand(0); SDValue MULUDQ = BC.getOperand(0); ShuffleVectorSDNode *SVOp = cast(N); ArrayRef Mask = SVOp->getMask(); if (BC.hasOneUse() && MULUDQ.hasOneUse() && Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) { SDValue Op0 = MULUDQ.getOperand(0); SDValue Op1 = MULUDQ.getOperand(1); if (Op0.getOpcode() == ISD::BITCAST && Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && Op0.getOperand(0).getValueType() == MVT::v4i32) { ShuffleVectorSDNode *SVOp0 = cast(Op0.getOperand(0)); ArrayRef Mask2 = SVOp0->getMask(); if (Mask2[0] == 0 && Mask2[1] == -1 && Mask2[2] == 1 && Mask2[3] == -1) { Op0 = SVOp0->getOperand(0); Op1 = DAG.getBitcast(MVT::v4i32, Op1); Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask); return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); } } if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE && Op1.getOperand(0).getValueType() == MVT::v4i32) { ShuffleVectorSDNode *SVOp1 = cast(Op1.getOperand(0)); ArrayRef Mask2 = SVOp1->getMask(); if (Mask2[0] == 0 && Mask2[1] == -1 && Mask2[2] == 1 && Mask2[3] == -1) { Op0 = DAG.getBitcast(MVT::v4i32, Op0); Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask); Op1 = SVOp1->getOperand(0); return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1); } } } } return SDValue(); } bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { int NumElts = DemandedElts.getBitWidth(); unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); // Handle special case opcodes. switch (Opc) { case X86ISD::VSHL: case X86ISD::VSRL: case X86ISD::VSRA: { // We only need the bottom 64-bits of the (128-bit) shift amount. SDValue Amt = Op.getOperand(1); MVT AmtVT = Amt.getSimpleValueType(); assert(AmtVT.is128BitVector() && "Unexpected value type"); APInt AmtUndef, AmtZero; unsigned NumAmtElts = AmtVT.getVectorNumElements(); APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, Depth + 1)) return true; LLVM_FALLTHROUGH; } case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); APInt SrcUndef; if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, Depth + 1)) return true; // TODO convert SrcUndef to KnownUndef. break; } case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); APInt SrcUndef, SrcZero; APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::PACKSS: case X86ISD::PACKUS: { APInt DemandedLHS, DemandedRHS; getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedLHS, SrcUndef, SrcZero, TLO, Depth + 1)) return true; if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedRHS, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); if (!SrcVT.isVector()) return false; // Don't bother broadcasting if we just need the 0'th element. 
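// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] Why the truncating shuffle of PMULUDQ above can become a plain
// 32-bit multiply: the v2i32 shuffle keeps only the low 32 bits of each 64-bit
// product, and those equal the wrapping 32-bit product of the low halves.
#include <cstdint>

static uint32_t lowHalfOfPmuludq(uint32_t A, uint32_t B) {
  uint64_t Wide = (uint64_t)A * (uint64_t)B; // the per-pair PMULUDQ result
  return (uint32_t)Wide;                     // equals A * B modulo 2^32
}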
if (DemandedElts == 1) { if(Src.getValueType() != VT) Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, SDLoc(Op)); return TLO.CombineTo(Op, Src); } APInt SrcUndef, SrcZero; APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; break; } case X86ISD::PSHUFB: { // TODO - simplify other variable shuffle masks. SDValue Mask = Op.getOperand(1); APInt MaskUndef, MaskZero; if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, Depth + 1)) return true; break; } } // Simplify target shuffles. if (!isTargetShuffle(Opc) || !VT.isSimple()) return false; // Get target shuffle mask. bool IsUnary; SmallVector OpMask; SmallVector OpInputs; if (!getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, OpInputs, OpMask, IsUnary)) return false; // Shuffle inputs must be the same type as the result. if (llvm::any_of(OpInputs, [VT](SDValue V) { return VT != V.getValueType(); })) return false; // Clear known elts that might have been set above. KnownZero.clearAllBits(); KnownUndef.clearAllBits(); // Check if shuffle mask can be simplified to undef/zero/identity. int NumSrcs = OpInputs.size(); for (int i = 0; i != NumElts; ++i) { int &M = OpMask[i]; if (!DemandedElts[i]) M = SM_SentinelUndef; else if (0 <= M && OpInputs[M / NumElts].isUndef()) M = SM_SentinelUndef; } if (isUndefInRange(OpMask, 0, NumElts)) { KnownUndef.setAllBits(); return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); } if (isUndefOrZeroInRange(OpMask, 0, NumElts)) { KnownZero.setAllBits(); return TLO.CombineTo( Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); } for (int Src = 0; Src != NumSrcs; ++Src) if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) return TLO.CombineTo(Op, OpInputs[Src]); // Attempt to simplify inputs. for (int Src = 0; Src != NumSrcs; ++Src) { int Lo = Src * NumElts; APInt SrcElts = APInt::getNullValue(NumElts); for (int i = 0; i != NumElts; ++i) if (DemandedElts[i]) { int M = OpMask[i] - Lo; if (0 <= M && M < NumElts) SrcElts.setBit(M); } APInt SrcUndef, SrcZero; if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, TLO, Depth + 1)) return true; } // Extract known zero/undef elements. // TODO - Propagate input undef/zero elts. for (int i = 0; i != NumElts; ++i) { if (OpMask[i] == SM_SentinelUndef) KnownUndef.setBit(i); if (OpMask[i] == SM_SentinelZero) KnownZero.setBit(i); } return false; } bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const { EVT VT = Op.getValueType(); unsigned BitWidth = OriginalDemandedBits.getBitWidth(); unsigned Opc = Op.getOpcode(); switch(Opc) { case X86ISD::PMULDQ: case X86ISD::PMULUDQ: { // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. KnownBits KnownOp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); // FIXME: Can we bound this better? 
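// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The "attempt to simplify inputs" loop above, with plain containers
// standing in for APInt: for each shuffle source, collect the source elements
// that actually feed a demanded result lane.
#include <vector>

static std::vector<bool>
demandedEltsForSource(const std::vector<int> &OpMask,
                      const std::vector<bool> &DemandedElts, int Src,
                      int NumElts) {
  std::vector<bool> SrcElts(NumElts, false);
  const int Lo = Src * NumElts;
  for (int i = 0; i < NumElts; ++i)
    if (DemandedElts[i]) {
      int M = OpMask[i] - Lo; // rebase the index into this source's range
      if (0 <= M && M < NumElts)
        SrcElts[M] = true;    // this source element is live
    }
  return SrcElts;
}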
APInt DemandedMask = APInt::getLowBitsSet(64, 32); if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1)) return true; if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1)) return true; break; } case X86ISD::VSHLI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (auto *ShiftImm = dyn_cast(Op1)) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a // single shift. We can do this if the bottom bits (which are shifted // out) are never demanded. if (Op0.getOpcode() == X86ISD::VSRLI && OriginalDemandedBits.countTrailingZeros() >= ShAmt) { if (auto *Shift2Imm = dyn_cast(Op0.getOperand(1))) { if (Shift2Imm->getAPIntValue().ult(BitWidth)) { int Diff = ShAmt - Shift2Imm->getZExtValue(); if (Diff == 0) return TLO.CombineTo(Op, Op0.getOperand(0)); unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; SDValue NewShift = TLO.DAG.getNode( NewOpc, SDLoc(Op), VT, Op0.getOperand(0), TLO.DAG.getConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); return TLO.CombineTo(Op, NewShift); } } } if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShAmt; Known.One <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); } break; } case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast(Op.getOperand(1))) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits << ShAmt; if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // High bits known zero. Known.Zero.setHighBits(ShAmt); } break; } case X86ISD::VSRAI: { SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (auto *ShiftImm = dyn_cast(Op1)) { if (ShiftImm->getAPIntValue().uge(BitWidth)) break; unsigned ShAmt = ShiftImm->getZExtValue(); APInt DemandedMask = OriginalDemandedBits << ShAmt; // If we just want the sign bit then we don't need to shift it. if (OriginalDemandedBits.isSignMask()) return TLO.CombineTo(Op, Op0); // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) { SDValue Op00 = Op0.getOperand(0); unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); if (ShAmt < NumSignBits) return TLO.CombineTo(Op, Op00); } // If any of the demanded bits are produced by the sign extension, we also // demand the input sign bit. if (OriginalDemandedBits.countLeadingZeros() < ShAmt) DemandedMask.setSignBit(); if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, TLO, Depth + 1)) return true; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. if (Known.Zero[BitWidth - ShAmt - 1] || OriginalDemandedBits.countLeadingZeros() >= ShAmt) return TLO.CombineTo( Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); // High bits are known one. 
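// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The shift-pair fold above in scalar form. It is only valid under
// the demanded-bits condition checked there (no bits below the left-shift
// amount are demanded); on those demanded bit positions, (X >>u C1) << C2
// agrees with a single shift by |C2 - C1|. Shift amounts are assumed < 32.
#include <cstdint>

static uint32_t foldShrThenShl(uint32_t X, unsigned C1, unsigned C2) {
  int Diff = (int)C2 - (int)C1;
  if (Diff == 0)
    return X;         // shifts cancel on the demanded (upper) bits
  if (Diff > 0)
    return X << Diff; // net left shift
  return X >> -Diff;  // net logical right shift
}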
if (Known.One[BitWidth - ShAmt - 1]) Known.One.setHighBits(ShAmt); } break; } case X86ISD::MOVMSK: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); unsigned SrcBits = SrcVT.getScalarSizeInBits(); unsigned NumElts = SrcVT.getVectorNumElements(); // If we don't need the sign bits at all just return zero. if (OriginalDemandedBits.countTrailingZeros() >= NumElts) return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); // Only demand the vector elements of the sign bits we need. APInt KnownUndef, KnownZero; APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, TLO, Depth + 1)) return true; Known.Zero = KnownZero.zextOrSelf(BitWidth); Known.Zero.setHighBits(BitWidth - NumElts); // MOVMSK only uses the MSB from each vector element. KnownBits KnownSrc; if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts, KnownSrc, TLO, Depth + 1)) return true; if (KnownSrc.One[SrcBits - 1]) Known.One.setLowBits(NumElts); else if (KnownSrc.Zero[SrcBits - 1]) Known.Zero.setLowBits(NumElts); return false; } } return TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but /// shuffles have been custom lowered so we need to handle those here. static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue InVec = N->getOperand(0); SDValue EltNo = N->getOperand(1); EVT EltVT = N->getValueType(0); if (!isa(EltNo)) return SDValue(); EVT OriginalVT = InVec.getValueType(); // Peek through bitcasts, don't duplicate a load with other uses. InVec = peekThroughOneUseBitcasts(InVec); EVT CurrentVT = InVec.getValueType(); if (!CurrentVT.isVector() || CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements()) return SDValue(); if (!isTargetShuffle(InVec.getOpcode())) return SDValue(); // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); SmallVector ShuffleMask; SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = CurrentVT.getVectorNumElements(); int Elt = cast(EltNo)->getZExtValue(); int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt]; if (Idx == SM_SentinelZero) return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT) : DAG.getConstantFP(+0.0, SDLoc(N), EltVT); if (Idx == SM_SentinelUndef) return DAG.getUNDEF(EltVT); // Bail if any mask element is SM_SentinelZero - getVectorShuffle below // won't handle it. if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; })) return SDValue(); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses unsigned AllowedUses = (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. 
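// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] A scalar model of MOVMSK as used in the demanded-bits handling
// above: result bit i is the sign (most significant) bit of vector element i,
// and every higher result bit is zero -- hence the known-zero high bits.
// Assumes at most 32 elements, as for the vector types handled here.
#include <cstdint>
#include <vector>

static uint32_t movmskModel(const std::vector<int32_t> &Elts) {
  uint32_t Mask = 0;
  for (size_t i = 0; i < Elts.size(); ++i)
    if (Elts[i] < 0) // sign bit set
      Mask |= 1u << i;
  return Mask;
}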
if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0)) return SDValue(); AllowedUses = 1; // only allow 1 load use if we have a bitcast LdNode = LdNode.getOperand(0); } if (!ISD::isNormalLoad(LdNode.getNode())) return SDValue(); LoadSDNode *LN0 = cast(LdNode); if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile()) return SDValue(); // If there's a bitcast before the shuffle, check if the load type and // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( EltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT)) return SDValue(); // All checks match so transform back to vector_shuffle so that DAG combiner // can finish the job SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle, ShuffleMask); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, EltNo); } // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the illegal vector is scalarized on subtargets that don't have legal // vxi1 types. static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, const X86Subtarget &Subtarget) { EVT VT = BitCast.getValueType(); SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 // vpcmpeqb/vpcmpgtb. bool IsTruncated = N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && (N0.getOperand(0).getValueType() == MVT::v16i8 || N0.getOperand(0).getValueType() == MVT::v32i8 || N0.getOperand(0).getValueType() == MVT::v64i8); // With AVX512 vxi1 types are legal and we prefer using k-regs. // MOVMSK is supported in SSE2 or later. if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated)) return SDValue(); // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and // v8f64. So all legal 128-bit and 256-bit vectors are covered except for // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we // avoid sign-extending to this type entirely. // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; switch (VecVT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v2i1: SExtVT = MVT::v2i64; break; case MVT::v4i1: SExtVT = MVT::v4i32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. 
if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && N0->getOperand(0).getValueType().is256BitVector()) { SExtVT = MVT::v4i64; } break; case MVT::v8i1: SExtVT = MVT::v8i16; // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), // sign-extend to a 256-bit operation to match the compare. // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && (N0->getOperand(0).getValueType().is256BitVector() || N0->getOperand(0).getValueType().is512BitVector())) { SExtVT = MVT::v8i32; } break; case MVT::v16i1: SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: SExtVT = MVT::v32i8; break; case MVT::v64i1: // If we have AVX512F, but not AVX512BW and the input is truncated from // v64i8 checked earlier. Then split the input and make two pmovmskbs. if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) { SExtVT = MVT::v64i8; break; } return SDValue(); }; SDLoc DL(BitCast); SDValue V = DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, N0); if (SExtVT == MVT::v64i8) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(V, DL); Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, DAG.getConstant(32, DL, MVT::i8)); V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); } else if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) { V = getPMOVMSKB(DL, V, DAG, Subtarget); } else { if (SExtVT == MVT::v8i16) V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, DAG.getUNDEF(MVT::v8i16)); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); } return DAG.getZExtOrTrunc(V, DL, VT); } // Convert a vXi1 constant build vector to the same width scalar integer. static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { EVT SrcVT = Op.getValueType(); assert(SrcVT.getVectorElementType() == MVT::i1 && "Expected a vXi1 vector"); assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && "Expected a constant build vector"); APInt Imm(SrcVT.getVectorNumElements(), 0); for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) { SDValue In = Op.getOperand(Idx); if (!In.isUndef() && (cast(In)->getZExtValue() & 0x1)) Imm.setBit(Idx); } EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); return DAG.getConstant(Imm, SDLoc(Op), IntVT); } static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast"); if (!DCI.isBeforeLegalizeOps()) return SDValue(); // Only do this if we have k-registers. if (!Subtarget.hasAVX512()) return SDValue(); EVT DstVT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT SrcVT = Op.getValueType(); if (!Op.hasOneUse()) return SDValue(); // Look for logic ops. if (Op.getOpcode() != ISD::AND && Op.getOpcode() != ISD::OR && Op.getOpcode() != ISD::XOR) return SDValue(); // Make sure we have a bitcast between mask registers and a scalar type. 
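// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The v64i1 split above in scalar form: the two 32-bit PMOVMSKB
// results are glued into one i64 by zero-extending the low half and shifting
// the high half up by 32 (which is why an ANY_EXTEND suffices for the high
// half -- its upper bits are discarded by the shift).
#include <cstdint>

static uint64_t combineSplitByteMask(uint32_t LoMask, uint32_t HiMask) {
  return (uint64_t)LoMask | ((uint64_t)HiMask << 32);
}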
if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && DstVT.isScalarInteger()) && !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 && SrcVT.isScalarInteger())) return SDValue(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST && LHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0), DAG.getBitcast(DstVT, RHS)); if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST && RHS.getOperand(0).getValueType() == DstVT) return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS.getOperand(0)); // If the RHS is a vXi1 build vector, this is a good reason to flip too. // Most of these have to move a constant from the scalar domain anyway. if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) { RHS = combinevXi1ConstantToInteger(RHS, DAG); return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, DAG.getBitcast(DstVT, LHS), RHS); } return SDValue(); } static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); unsigned NumElts = N.getNumOperands(); auto *BV = cast(N); SDValue Splat = BV->getSplatValue(); // Build MMX element from integer GPR or SSE float values. auto CreateMMXElement = [&](SDValue V) { if (V.isUndef()) return DAG.getUNDEF(MVT::x86mmx); if (V.getValueType().isFloatingPoint()) { if (Subtarget.hasSSE1() && !isa(V)) { V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V); V = DAG.getBitcast(MVT::v2i64, V); return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V); } V = DAG.getBitcast(MVT::i32, V); } else { V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32); } return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V); }; // Convert build vector ops to MMX data in the bottom elements. SmallVector Ops; // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element. if (Splat) { if (Splat.isUndef()) return DAG.getUNDEF(MVT::x86mmx); Splat = CreateMMXElement(Splat); if (Subtarget.hasSSE1()) { // Unpack v8i8 to splat i8 elements to lowest 16-bits. if (NumElts == 8) Splat = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat, Splat); // Use PSHUFW to repeat 16-bit elements. unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat, DAG.getConstant(ShufMask, DL, MVT::i8)); } Ops.append(NumElts, Splat); } else { for (unsigned i = 0; i != NumElts; ++i) Ops.push_back(CreateMMXElement(N.getOperand(i))); } // Use tree of PUNPCKLs to build up general MMX vector. while (Ops.size() > 1) { unsigned NumOps = Ops.size(); unsigned IntrinOp = (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq : (NumOps == 4 ? 
Intrinsic::x86_mmx_punpcklwd : Intrinsic::x86_mmx_punpcklbw)); SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32); for (unsigned i = 0; i != NumOps; i += 2) Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin, Ops[i], Ops[i + 1]); Ops.resize(NumOps / 2); } return Ops[0]; } static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SrcVT = N0.getValueType(); // Try to match patterns such as // (i16 bitcast (v16i1 x)) // -> // (i16 movmsk (16i8 sext (v16i1 x))) // before the setcc result is scalarized on subtargets that don't have legal // vxi1 types. if (DCI.isBeforeLegalize()) { if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget)) return V; // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && Subtarget.hasAVX512()) { SDLoc dl(N); N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); N0 = DAG.getBitcast(MVT::v8i1, N0); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, DAG.getIntPtrConstant(0, dl)); } // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && Subtarget.hasAVX512()) { SDLoc dl(N); unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(SrcVT)); Ops[0] = N0; N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); N0 = DAG.getBitcast(MVT::i8, N0); return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); } } // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. if (VT == MVT::x86mmx) { // Detect MMX constant vectors. APInt UndefElts; SmallVector EltBits; if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) { SDLoc DL(N0); // Handle zero-extension of i32 with MOVD. if (EltBits[0].countLeadingZeros() >= 32) return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT, DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32)); // Else, bitcast to a double. // TODO - investigate supporting sext 32-bit immediates on x86_64. APFloat F64(APFloat::IEEEdouble(), EltBits[0]); return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64)); } // Detect bitcasts to x86mmx low word. if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) && N0.getOperand(0).getValueType() == SrcVT.getScalarType()) { bool LowUndef = true, AllUndefOrZero = true; for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { SDValue Op = N0.getOperand(i); LowUndef &= Op.isUndef() || (i >= e/2); AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op)); } if (AllUndefOrZero) { SDValue N00 = N0.getOperand(0); SDLoc dl(N00); N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32) : DAG.getZExtOrTrunc(N00, dl, MVT::i32); return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00); } } // Detect bitcasts of 64-bit build vectors and convert to a // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the // lowest element. 
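// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The "zero-extension of i32" test used for MMX constants above: a
// 64-bit constant can be materialized with MOVD exactly when its top 32 bits
// are zero, which is what countLeadingZeros() >= 32 expresses.
#include <cstdint>

static bool isZeroExtendedI32(uint64_t C) {
  return (C >> 32) == 0; // equivalent to countLeadingZeros(C) >= 32
}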
if (N0.getOpcode() == ISD::BUILD_VECTOR && (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8)) return createMMXBuildVector(N0, DAG, Subtarget); // Detect bitcasts between element or subvector extraction to x86mmx. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && isNullConstant(N0.getOperand(1))) { SDValue N00 = N0.getOperand(0); if (N00.getValueType().is128BitVector()) return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, DAG.getBitcast(MVT::v2i64, N00)); } // Detect bitcasts from FP_TO_SINT to x86mmx. if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { SDLoc DL(N0); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, DAG.getUNDEF(MVT::v2i32)); return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, DAG.getBitcast(MVT::v2i64, Res)); } } // Try to remove a bitcast of constant vXi1 vector. We have to legalize // most of these to scalar anyway. if (Subtarget.hasAVX512() && VT.isScalarInteger() && SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { return combinevXi1ConstantToInteger(N0, DAG); } if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && isa(N0)) { auto *C = cast(N0); if (C->isAllOnesValue()) return DAG.getConstant(1, SDLoc(N0), VT); if (C->isNullValue()) return DAG.getConstant(0, SDLoc(N0), VT); } // Try to remove bitcasts from input and output of mask arithmetic to // remove GPR<->K-register crossings. if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) return V; // Convert a bitcasted integer logic operation that has one bitcasted // floating-point operand into a floating-point logic operation. This may // create a load of a constant, but that is cheaper than materializing the // constant in an integer register and transferring it to an SSE register or // transferring the SSE operand to integer register and back. unsigned FPOpcode; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = X86ISD::FAND; break; case ISD::OR: FPOpcode = X86ISD::FOR; break; case ISD::XOR: FPOpcode = X86ISD::FXOR; break; default: return SDValue(); } if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64))) return SDValue(); SDValue LogicOp0 = N0.getOperand(0); SDValue LogicOp1 = N0.getOperand(1); SDLoc DL0(N0); // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT && !isa(LogicOp0.getOperand(0))) { SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); } // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT && !isa(LogicOp1.getOperand(0))) { SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); } return SDValue(); } // Given a select, detect the following pattern: // 1: %2 = zext %0 to // 2: %3 = zext %1 to // 3: %4 = sub nsw %2, %3 // 4: %5 = icmp sgt %4, [0 x N] or [-1 x N] // 5: %6 = sub nsw zeroinitializer, %4 // 6: %7 = select %5, %4, %6 // This is useful as it is the input into a SAD pattern. 
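// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] The bit-level fact behind the integer-to-FP logic rewrite above:
// AND/OR/XOR commute with bitcasts, so performing the logic op on the raw
// float bit pattern yields the same bits as the integer-domain op. A scalar
// check, using memcpy for the bitcasts:
#include <cstdint>
#include <cstring>

static float bitwiseAndFloats(float X, float Y) {
  uint32_t XI, YI;
  std::memcpy(&XI, &X, sizeof(XI));
  std::memcpy(&YI, &Y, sizeof(YI));
  uint32_t RI = XI & YI; // same bits whichever domain performs the AND
  float R;
  std::memcpy(&R, &RI, sizeof(R));
  return R;
}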
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, SDValue &Op1) { // Check the condition of the select instruction is greater-than. SDValue SetCC = Select->getOperand(0); if (SetCC.getOpcode() != ISD::SETCC) return false; ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); if (CC != ISD::SETGT && CC != ISD::SETLT) return false; SDValue SelectOp1 = Select->getOperand(1); SDValue SelectOp2 = Select->getOperand(2); // The following instructions assume SelectOp1 is the subtraction operand // and SelectOp2 is the negation operand. // In the case of SETLT this is the other way around. if (CC == ISD::SETLT) std::swap(SelectOp1, SelectOp2); // The second operand of the select should be the negation of the first // operand, which is implemented as 0 - SelectOp1. if (!(SelectOp2.getOpcode() == ISD::SUB && ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) && SelectOp2.getOperand(1) == SelectOp1)) return false; // The first operand of SetCC is the first operand of the select, which is the // difference between the two input vectors. if (SetCC.getOperand(0) != SelectOp1) return false; // In SetLT case, The second operand of the comparison can be either 1 or 0. APInt SplatVal; if ((CC == ISD::SETLT) && !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && SplatVal.isOneValue()) || (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) return false; // In SetGT case, The second operand of the comparison can be either -1 or 0. if ((CC == ISD::SETGT) && !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) || ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode()))) return false; // The first operand of the select is the difference between the two input // vectors. if (SelectOp1.getOpcode() != ISD::SUB) return false; Op0 = SelectOp1.getOperand(0); Op1 = SelectOp1.getOperand(1); // Check if the operands of the sub are zero-extended from vectors of i8. if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || Op1.getOpcode() != ISD::ZERO_EXTEND || Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) return false; return true; } // Given two zexts of to , create a PSADBW of the inputs // to these zexts. static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget) { // Find the appropriate width for the PSADBW. EVT InVT = Zext0.getOperand(0).getValueType(); unsigned RegSize = std::max(128u, InVT.getSizeInBits()); // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we // fill in the missing vector elements with 0. unsigned NumConcat = RegSize / InVT.getSizeInBits(); SmallVector Ops(NumConcat, DAG.getConstant(0, DL, InVT)); Ops[0] = Zext0.getOperand(0); MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); Ops[0] = Zext1.getOperand(0); SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW. auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops); }; MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 }, PSADBWBuilder); } // Attempt to replace an min/max v8i16/v16i8 horizontal reduction with // PHMINPOSUW. 
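// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] A scalar model of one 64-bit PSADBW lane, matching the
// zext/sub/abs pattern detected above: each i64 result element is the sum of
// |a[i] - b[i]| over eight unsigned byte pairs.
#include <cstdint>
#include <cstdlib>

static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int i = 0; i < 8; ++i)
    Sum += (uint64_t)std::abs((int)A[i] - (int)B[i]);
  return Sum;
}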
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE41. if (!Subtarget.hasSSE41()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); if (!Src) return SDValue(); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) return SDValue(); SDLoc DL(Extract); SDValue MinPos = Src; // First, reduce the source down to 128-bit, applying BinOp to lo/hi. while (SrcVT.getSizeInBits() > 128) { unsigned NumElts = SrcVT.getVectorNumElements(); unsigned NumSubElts = NumElts / 2; SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); unsigned SubSizeInBits = SrcVT.getSizeInBits(); SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); } assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && "Unexpected value type"); // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask // to flip the value accordingly. SDValue Mask; unsigned MaskEltsBits = ExtractVT.getSizeInBits(); if (BinOp == ISD::SMAX) Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::SMIN) Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); else if (BinOp == ISD::UMAX) Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); // For v16i8 cases we need to perform UMIN on pairs of byte elements, // shuffling each upper element down and insert zeros. This means that the // v16i8 UMIN will leave the upper element as zero, performing zero-extension // ready for the PHMINPOS. if (ExtractVT == MVT::i8) { SDValue Upper = DAG.getVectorShuffle( SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); } // Perform the PHMINPOS on a v8i16 vector, MinPos = DAG.getBitcast(MVT::v8i16, MinPos); MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); MinPos = DAG.getBitcast(SrcVT, MinPos); if (Mask) MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, DAG.getIntPtrConstant(0, DL)); } // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Bail without SSE2 or with AVX512VL (which uses predicate registers). if (!Subtarget.hasSSE2() || Subtarget.hasVLX()) return SDValue(); EVT ExtractVT = Extract->getValueType(0); unsigned BitWidth = ExtractVT.getSizeInBits(); if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && ExtractVT != MVT::i8) return SDValue(); // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. 
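// [Illustrative sketch, editor's addition -- not part of the vendored LLVM
// source.] PHMINPOSUW only computes an unsigned 16-bit minimum, so the code
// above XORs with a constant before and after to express the other reductions
// in terms of UMIN:
//   umax(a, b) == umin(a ^ 0xFFFF, b ^ 0xFFFF) ^ 0xFFFF
//   smin(a, b) == umin(a ^ 0x8000, b ^ 0x8000) ^ 0x8000
//   smax(a, b) == umin(a ^ 0x7FFF, b ^ 0x7FFF) ^ 0x7FFF
// A scalar check of the SMAX identity:
#include <algorithm>
#include <cstdint>

static uint16_t smaxViaUmin(uint16_t A, uint16_t B) {
  const uint16_t Mask = 0x7FFF; // APInt::getSignedMaxValue(16), as above
  return std::min<uint16_t>(A ^ Mask, B ^ Mask) ^ Mask;
}
// smaxViaUmin(0xFFFF /* -1 */, 0x0001 /* +1 */) == 0x0001.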
ISD::NodeType BinOp; SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); if (!Match) return SDValue(); // EXTRACT_VECTOR_ELT can require implicit extension of the vector element // which we can't support here for now. if (Match.getScalarValueSizeInBits() != BitWidth) return SDValue(); // We require AVX2 for PMOVMSKB for v16i16/v32i8; unsigned MatchSizeInBits = Match.getValueSizeInBits(); if (!(MatchSizeInBits == 128 || (MatchSizeInBits == 256 && ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) return SDValue(); // Don't bother performing this for 2-element vectors. if (Match.getValueType().getVectorNumElements() <= 2) return SDValue(); // Check that we are extracting a reduction of all sign bits. if (DAG.ComputeNumSignBits(Match) != BitWidth) return SDValue(); // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. MVT MaskVT; if (64 == BitWidth || 32 == BitWidth) MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), MatchSizeInBits / BitWidth); else MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); APInt CompareBits; ISD::CondCode CondCode; if (BinOp == ISD::OR) { // any_of -> MOVMSK != 0 CompareBits = APInt::getNullValue(32); CondCode = ISD::CondCode::SETNE; } else { // all_of -> MOVMSK == ((1 << NumElts) - 1) CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); CondCode = ISD::CondCode::SETEQ; } // Perform the select as i32/i64 and then truncate to avoid partial register // stalls. unsigned ResWidth = std::max(BitWidth, 32u); EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); SDLoc DL(Extract); SDValue Zero = DAG.getConstant(0, DL, ResVT); SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); SDValue Res = DAG.getBitcast(MaskVT, Match); Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), Ones, Zero, CondCode); return DAG.getSExtOrTrunc(Res, DL, ExtractVT); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // PSADBW is only supported on SSE2 and up. if (!Subtarget.hasSSE2()) return SDValue(); // Verify the type we're extracting from is any integer type above i16. EVT VT = Extract->getOperand(0).getValueType(); if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) return SDValue(); unsigned RegSize = 128; if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (RegSize / VT.getVectorNumElements() < 8) return SDValue(); // Match shuffle + add pyramid. ISD::NodeType BinOp; SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). // In order to convert to i64 and above, additional any/zero/sign // extend is expected. // The zero extend from 32 bit has no mathematical effect on the result. // Also the sign extend is basically zero extend // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || Root.getOpcode() == ISD::ZERO_EXTEND || Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an // abs-diff pattern. 
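// e.g. Root = (vselect (setgt D, <all -1>), D, (sub <all 0>, D)) where
// D = (sub (zext <N x i8> a), (zext <N x i8> b)), i.e. a lane-wise |a - b|.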
if (!Root || (Root.getOpcode() != ISD::VSELECT)) return SDValue(); // Check whether we have an abs-diff pattern feeding into the select. SDValue Zext0, Zext1; if (!detectZextAbsDiff(Root, Zext0, Zext1)) return SDValue(); // Create the SAD instruction. SDLoc DL(Extract); SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget); // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); MVT SadVT = SAD.getSimpleValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); for(unsigned i = Stages - 3; i > 0; --i) { SmallVector Mask(SadElems, -1); for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) Mask[j] = MaskEnd + j; SDValue Shuffle = DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); } } MVT Type = Extract->getSimpleValueType(0); unsigned TypeSizeInBits = Type.getSizeInBits(); // Return the lowest TypeSizeInBits bits. MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } // Attempt to peek through a target shuffle and extract the scalar from the // source. static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue Src = N->getOperand(0); SDValue Idx = N->getOperand(1); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); // Don't attempt this for boolean mask vectors or unknown extraction indices. if (SrcSVT == MVT::i1 || !isa(Idx)) return SDValue(); // Handle extract(broadcast(scalar_value)), it doesn't matter what index is. if (X86ISD::VBROADCAST == Src.getOpcode() && Src.getOperand(0).getValueType() == VT) return Src.getOperand(0); // Resolve the target shuffle inputs and mask. SmallVector Mask; SmallVector Ops; if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. if (Mask.size() != NumSrcElts) { if ((NumSrcElts % Mask.size()) == 0) { SmallVector ScaledMask; int Scale = NumSrcElts / Mask.size(); scaleShuffleMask(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { // Simplify Mask based on demanded element. int ExtractIdx = (int)N->getConstantOperandVal(1); int Scale = Mask.size() / NumSrcElts; int Lo = Scale * ExtractIdx; int Hi = Scale * (ExtractIdx + 1); for (int i = 0, e = (int)Mask.size(); i != e; ++i) if (i < Lo || Hi <= i) Mask[i] = SM_SentinelUndef; SmallVector WidenedMask; while (Mask.size() > NumSrcElts && canWidenShuffleElements(Mask, WidenedMask)) Mask = std::move(WidenedMask); // TODO - investigate support for wider shuffle masks with known upper // undef/zero elements for implicit zero-extension. } } // Check if narrowing/widening failed. if (Mask.size() != NumSrcElts) return SDValue(); int SrcIdx = Mask[N->getConstantOperandVal(1)]; SDLoc dl(N); // If the shuffle source element is undef/zero then we can just accept it. if (SrcIdx == SM_SentinelUndef) return DAG.getUNDEF(VT); if (SrcIdx == SM_SentinelZero) return VT.isFloatingPoint() ? 
DAG.getConstantFP(0.0, dl, VT) : DAG.getConstant(0, dl, VT); SDValue SrcOp = Ops[SrcIdx / Mask.size()]; SrcOp = DAG.getBitcast(SrcVT, SrcOp); SrcIdx = SrcIdx % Mask.size(); // We can only extract other elements from 128-bit vectors and in certain // circumstances, depending on SSE-level. // TODO: Investigate using extract_subvector for larger vectors. // TODO: Investigate float/double extraction if it will be just stored. if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) && ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { assert(SrcSVT == VT && "Unexpected extraction type"); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); } if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) || (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) { assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() && "Unexpected extraction type"); unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); return DAG.getZExtOrTrunc(ExtOp, dl, VT); } return SDValue(); } /// Detect vector gather/scatter index generation and convert it from being a /// bunch of shuffles and extracts into a somewhat faster sequence. /// For i686, the best sequence is apparently storing the value and loading /// scalars back, while for x64 we should use 64-bit extracts and shifts. static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; // TODO - Remove this once we can handle the implicit zero-extension of // X86ISD::PEXTRW/X86ISD::PEXTRB in: // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and // combineBasicSADPattern. if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); SDValue EltIdx = N->getOperand(1); EVT SrcVT = InputVector.getValueType(); EVT VT = N->getValueType(0); SDLoc dl(InputVector); // Detect mmx extraction of all bits as a i64. It works better as a bitcast. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getBitcast(VT, InputVector); } // Detect mmx to i32 conversion through a v2i32 elt extract. if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() && VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) { SDValue MMXSrc = InputVector.getOperand(0); // The bitcast source is a direct mmx result. if (MMXSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc); } if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST && isa(EltIdx) && isa(InputVector.getOperand(0))) { uint64_t ExtractedElt = N->getConstantOperandVal(1); auto *InputC = cast(InputVector.getOperand(0)); const APInt &InputValue = InputC->getAPIntValue(); uint64_t Res = InputValue[ExtractedElt]; return DAG.getConstant(Res, dl, MVT::i1); } // Check whether this extract is the root of a sum of absolute differences // pattern. 
This has to be done here because we really want it to happen // pre-legalization, if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) return SAD; // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) return MinMax; return SDValue(); } /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); SDLoc DL(N); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (N->getOpcode() != ISD::VSELECT) return SDValue(); assert(CondVT.isVector() && "Vector select expects a vector selector!"); bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 return DAG.getSelect(DL, VT, CondNew, RHS, LHS); } // To use the condition operand as a bitwise mask, it must have elements that // are the same size as the select elements. Ie, the condition operand must // have already been promoted from the IR select condition type . // Don't check if the types themselves are equal because that excludes // vector floating-point selects. if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try to invert the condition if true value is not all 1s and false value is // not all 0s. if (!TValIsAllOnes && !FValIsAllZeros && // Check if the selector will be produced by CMPP*/PCMP*. Cond.getOpcode() == ISD::SETCC && // Check if SETCC has already been promoted. TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { SDValue CC = Cond.getOperand(2); ISD::CondCode NewCC = ISD::getSetCCInverse(cast(CC)->get(), Cond.getOperand(0).getValueType().isInteger()); Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); std::swap(LHS, RHS); TValIsAllOnes = FValIsAllOnes; FValIsAllZeros = TValIsAllZeros; } } // Cond value must be 'sign splat' to be converted to a logical op. if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) return SDValue(); // vselect Cond, 111..., 000... -> Cond if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT)) return SDValue(); // vselect Cond, 111..., X -> or Cond, X if (TValIsAllOnes) { SDValue CastRHS = DAG.getBitcast(CondVT, RHS); SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); return DAG.getBitcast(VT, Or); } // vselect Cond, X, 000... 
-> and Cond, X if (FValIsAllZeros) { SDValue CastLHS = DAG.getBitcast(CondVT, LHS); SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); return DAG.getBitcast(VT, And); } // vselect Cond, 000..., X -> andn Cond, X if (TValIsAllZeros) { MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); SDValue CastCond = DAG.getBitcast(AndNVT, Cond); SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); return DAG.getBitcast(VT, AndN); } return SDValue(); } static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); SDLoc DL(N); auto *TrueC = dyn_cast(LHS); auto *FalseC = dyn_cast(RHS); if (!TrueC || !FalseC) return SDValue(); // Don't do this for crazy integer types. EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // We're going to use the condition bit in math or logic ops. We could allow // this with a wider condition value (post-legalization it becomes an i8), // but if nothing is creating selects that late, it doesn't matter. if (Cond.getValueType() != MVT::i1) return SDValue(); // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by // 3, 5, or 9 with i32/i64, so those get transformed too. // TODO: For constants that overflow or do not differ by power-of-2 or small // multiplier, convert to 'and' + 'add'. const APInt &TrueVal = TrueC->getAPIntValue(); const APInt &FalseVal = FalseC->getAPIntValue(); bool OV; APInt Diff = TrueVal.ssub_ov(FalseVal, OV); if (OV) return SDValue(); APInt AbsDiff = Diff.abs(); if (AbsDiff.isPowerOf2() || ((VT == MVT::i32 || VT == MVT::i64) && (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { // We need a positive multiplier constant for shift/LEA codegen. The 'not' // of the condition can usually be folded into a compare predicate, but even // without that, the sequence should be cheaper than a CMOV alternative. if (TrueVal.slt(FalseVal)) { Cond = DAG.getNOT(DL, Cond, MVT::i1); std::swap(TrueC, FalseC); } // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); // Multiply condition by the difference if non-one. if (!AbsDiff.isOneValue()) R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); // Add the base if non-zero. if (!FalseC->isNullValue()) R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); return R; } return SDValue(); } /// If this is a *dynamic* select (non-constant condition) and we can match /// this node with one of the variable blend instructions, restructure the /// condition so that blends can use the high (sign) bit of each element. /// This function will also call SimplfiyDemandedBits on already created /// BLENDV to perform additional simplifications. static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Cond = N->getOperand(0); if ((N->getOpcode() != ISD::VSELECT && N->getOpcode() != X86ISD::BLENDV) || ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) return SDValue(); // Don't optimize before the condition has been transformed to a legal type // and don't ever optimize vector selects that map to AVX512 mask-registers. 
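// (An AVX512 vXi1 mask condition has a scalar width of 1 and is rejected by
// the width check below.)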
unsigned BitWidth = Cond.getScalarValueSizeInBits(); if (BitWidth < 8 || BitWidth > 64) return SDValue(); // We can only handle the cases where VSELECT is directly legal on the // subtarget. We custom lower VSELECT nodes with constant conditions and // this makes it hard to see whether a dynamic VSELECT will correctly // lower, so we both check the operation's status and explicitly handle the // cases where a *dynamic* blend will fail even though a constant-condition // blend could be custom lowered. // FIXME: We should find a better way to handle this class of problems. // Potentially, we should combine constant-condition vselect nodes // pre-legalization into shuffles and not mark as many types as custom // lowered. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. if (VT.is128BitVector() && !Subtarget.hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) return SDValue(); // There are no 512-bit blend instructions that use sign bits. if (VT.is512BitVector()) return SDValue(); // TODO: Add other opcodes eventually lowered into BLEND. for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); UI != UE; ++UI) if ((UI->getOpcode() != ISD::VSELECT && UI->getOpcode() != X86ISD::BLENDV) || UI.getOperandNo() != 0) return SDValue(); APInt DemandedMask(APInt::getSignMask(BitWidth)); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true)) return SDValue(); // If we changed the computation somewhere in the DAG, this change will // affect all users of Cond. Update all the nodes so that we do not use // the generic VSELECT anymore. Otherwise, we may perform wrong // optimizations as we messed with the actual expectation for the vector // boolean values. for (SDNode *U : Cond->uses()) { if (U->getOpcode() == X86ISD::BLENDV) continue; SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), Cond, U->getOperand(1), U->getOperand(2)); DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); DCI.AddToWorklist(U); } DCI.CommitTargetLoweringOpt(TLO); return SDValue(N, 0); } /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); // Try simplification again because we use this function to optimize // BLENDV nodes that are not handled by the generic combiner. if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) return V; EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert vselects with constant condition into shuffles. 
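// e.g. (vselect <-1,0,-1,0>, A, B) --> (vector_shuffle A, B, <0,5,2,7>).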
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && DCI.isBeforeLegalizeOps()) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond)) return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); } // If we have SSE[12] support, try to form min/max nodes. SSE min/max // instructions match the semantics of the common C idiom x(Cond.getOperand(2))->get(); unsigned Opcode = 0; // Check for x CC y ? x : y. if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { switch (CC) { default: break; case ISD::SETULT: // Converting this to a min would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETOLE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMIN; break; case ISD::SETULE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMIN; break; case ISD::SETOGE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETUGT: // Converting this to a max would handle NaNs incorrectly, and swapping // the operands would cause it to handle comparisons between positive // and negative zero incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETUGE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMAX; break; } // Check for x CC y ? y : x -- a min/max with reversed arms. } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && DAG.isEqualTo(RHS, Cond.getOperand(0))) { switch (CC) { default: break; case ISD::SETOGE: // Converting this to a min would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !(DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS))) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMIN; break; case ISD::SETUGT: // Converting this to a min would handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) break; Opcode = X86ISD::FMIN; break; case ISD::SETUGE: // Converting this to a min would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. 
std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: case ISD::SETGE: Opcode = X86ISD::FMIN; break; case ISD::SETULT: // Converting this to a max would handle NaNs incorrectly. if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; Opcode = X86ISD::FMAX; break; case ISD::SETOLE: // Converting this to a max would handle comparisons between positive // and negative zero incorrectly, and swapping the operands would // cause it to handle NaNs incorrectly. if (!DAG.getTarget().Options.UnsafeFPMath && !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) { if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) break; std::swap(LHS, RHS); } Opcode = X86ISD::FMAX; break; case ISD::SETULE: // Converting this to a max would handle both negative zeros and NaNs // incorrectly, but we can swap the operands to fix both. std::swap(LHS, RHS); LLVM_FALLTHROUGH; case ISD::SETOLT: case ISD::SETLT: case ISD::SETLE: Opcode = X86ISD::FMAX; break; } } if (Opcode) return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); } // Some mask scalar intrinsics rely on checking if only one bit is set // and implement it in C code like this: // A[0] = (U & 1) ? A[0] : W[0]; // This creates some redundant instructions that break pattern matching. // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y) if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); SDValue AndNode = Cond.getOperand(0); if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ && isNullConstant(Cond.getOperand(1)) && isOneConstant(AndNode.getOperand(1))) { // LHS and RHS swapped due to // setcc outputting 1 when AND resulted in 0 and vice versa. AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8); return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS); } } // v16i8 (select v16i1, v16i8, v16i8) does not have a proper // lowering on KNL. In this case we convert it to // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction. // The same situation all vectors of i8 and i16 without BWI. // Make sure we extend these even before type legalization gets a chance to // split wide vectors. // Since SKX these selects have a proper lowering. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && CondVT.getVectorElementType() == MVT::i1 && (ExperimentalVectorWideningLegalization || VT.getVectorNumElements() > 4) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); } if (SDValue V = combineSelectOfTwoConstants(N, DAG)) return V; // Canonicalize max and min: // (x > y) ? x : y -> (x >= y) ? x : y // (x < y) ? x : y -> (x <= y) ? x : y // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates // the need for an extra compare // against zero. e.g. // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0 // subl %esi, %edi // testl %edi, %edi // movl $0, %eax // cmovgl %edi, %eax // => // xorl %eax, %eax // subl %esi, $edi // cmovsl %eax, %edi if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && DAG.isEqualTo(LHS, Cond.getOperand(0)) && DAG.isEqualTo(RHS, Cond.getOperand(1))) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETLT: case ISD::SETGT: { ISD::CondCode NewCC = (CC == ISD::SETLT) ? 
ISD::SETLE : ISD::SETGE; Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), Cond.getOperand(1), NewCC); return DAG.getSelect(DL, VT, Cond, LHS, RHS); } } } // Match VSELECTs into subs with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // psubus is available in SSE2 for i8 and i16 vectors. Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); // Check if one of the arms of the VSELECT is a zero vector. If it's on the // left side invert the predicate to simplify logic below. SDValue Other; if (ISD::isBuildVectorAllZeros(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other->getNumOperands() == 2 && Other->getOperand(0) == Cond.getOperand(0)) { SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); SDValue CondRHS = Cond->getOperand(1); // Look for a general sub with unsigned saturation first. // x >= y ? x-y : 0 --> subus x, y // x > y ? x-y : 0 --> subus x, y if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && Other->getOpcode() == ISD::SUB && OpRHS == CondRHS) return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); if (auto *OpRHSBV = dyn_cast(OpRHS)) { if (isa(CondRHS)) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > C-1 ? x+-C : 0 --> subus x, C // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1); }; if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT)) { OpRHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), OpRHS); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } // Another special case: If C was a sign bit, the sub has been // canonicalized into a xor. // FIXME: Would it be better to use computeKnownBits to determine // whether it's safe to decanonicalize the xor? // x s< 0 ? x^C : 0 --> subus x, C if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) { if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR && ISD::isBuildVectorAllZeros(CondRHS.getNode()) && OpRHSConst->getAPIntValue().isSignMask()) { // Note that we have to rebuild the RHS constant here to ensure we // don't rely on particular values of undef lanes. OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT); return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS); } } } } } } // Match VSELECTs into add with unsigned saturation. if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && // paddus is available in SSE2 for i8 and i16 vectors. Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 && isPowerOf2_32(VT.getVectorNumElements()) && (VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); SDValue CondLHS = Cond->getOperand(0); SDValue CondRHS = Cond->getOperand(1); // Check if one of the arms of the VSELECT is vector with all bits set. // If it's on the left side invert the predicate to simplify logic below. 
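// e.g. (vselect (setugt x, (add x, y)), <all -1>, (add x, y)) is treated as
// (vselect (setule x, (add x, y)), (add x, y), <all -1>) so it matches the
// addus patterns below.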
SDValue Other; if (ISD::isBuildVectorAllOnes(LHS.getNode())) { Other = RHS; CC = ISD::getSetCCInverse(CC, true); } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) { Other = LHS; } if (Other.getNode() && Other.getOpcode() == ISD::ADD) { SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1); // Canonicalize condition operands. if (CC == ISD::SETUGE) { std::swap(CondLHS, CondRHS); CC = ISD::SETULE; } // We can test against either of the addition operands. // x <= x+y ? x+y : ~0 --> addus x, y // x+y >= x ? x+y : ~0 --> addus x, y if (CC == ISD::SETULE && Other == CondRHS && (OpLHS == CondLHS || OpRHS == CondLHS)) return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); if (isa(OpRHS) && isa(CondRHS) && CondLHS == OpLHS) { // If the RHS is a constant we have to reverse the const // canonicalization. // x > ~C ? x+C : ~0 --> addus x, C auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) { return Cond->getAPIntValue() == ~Op->getAPIntValue(); }; if (CC == ISD::SETULE && ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT)) return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS); } } } // Early exit check if (!TLI.isTypeLegal(VT)) return SDValue(); if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) return V; // Custom action for SELECT MMX if (VT == MVT::x86mmx) { LHS = DAG.getBitcast(MVT::i64, LHS); RHS = DAG.getBitcast(MVT::i64, RHS); SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS); return DAG.getBitcast(VT, newSelect); } return SDValue(); } /// Combine: /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) /// to: /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Can't replace the cmp if it has more uses than the one we're looking at. // FIXME: We would like to be able to handle this, but would need to make sure // all uses were updated. if (!Cmp.hasOneUse()) return SDValue(); // This only applies to variations of the common case: // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) // Using the proper condcodes (see below), overflow is checked for. // FIXME: We can generalize both constraints: // - XOR/OR/AND (if they were made to survive AtomicExpand) // - LHS != 1 // if the result is compared. SDValue CmpLHS = Cmp.getOperand(0); SDValue CmpRHS = Cmp.getOperand(1); if (!CmpLHS.hasOneUse()) return SDValue(); unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); SDValue OpRHS = CmpLHS.getOperand(2); auto *OpRHSC = dyn_cast(OpRHS); if (!OpRHSC) return SDValue(); APInt Addend = OpRHSC->getAPIntValue(); if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; auto *CmpRHSC = dyn_cast(CmpRHS); if (!CmpRHSC) return SDValue(); APInt Comparison = CmpRHSC->getAPIntValue(); // If the addend is the negation of the comparison value, then we can do // a full comparison by emitting the atomic arithmetic as a locked sub. 
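// e.g. the flags of (cmp (atomic_load_add p, -C), C) are exactly the flags
// produced by a locked 'sub [p], C', so the separate CMP can be dropped.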
if (Comparison == -Addend) { // The CC is fine, but we need to rewrite the LHS of the comparison as an // atomic sub. auto *AN = cast(CmpLHS.getNode()); auto AtomicSub = DAG.getAtomic( ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(), /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()), AN->getMemOperand()); auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // We can handle comparisons with zero in a number of cases by manipulating // the CC used. if (!Comparison.isNullValue()) return SDValue(); if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) CC = X86::COND_G; else if (CC == X86::COND_G && Addend == -1) CC = X86::COND_GE; else if (CC == X86::COND_LE && Addend == -1) CC = X86::COND_L; else return SDValue(); SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); return LockOp; } // Check whether a boolean test is testing a boolean value generated by // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition // code. // // Simplify the following patterns: // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) // to (Op EFLAGS Cond) // // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) // to (Op EFLAGS !Cond) // // where Op could be BRCOND or CMOV. // static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) return SDValue(); // Quit if not used as a boolean value. if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); // Check CMP operands. One of them should be 0 or 1 and the other should be // an SetCC or extended from it. SDValue Op1 = Cmp.getOperand(0); SDValue Op2 = Cmp.getOperand(1); SDValue SetCC; const ConstantSDNode* C = nullptr; bool needOppositeCond = (CC == X86::COND_E); bool checkAgainstTrue = false; // Is it a comparison against 1? if ((C = dyn_cast(Op1))) SetCC = Op2; else if ((C = dyn_cast(Op2))) SetCC = Op1; else // Quit if all operands are not constants. return SDValue(); if (C->getZExtValue() == 1) { needOppositeCond = !needOppositeCond; checkAgainstTrue = true; } else if (C->getZExtValue() != 0) // Quit if the constant is neither 0 or 1. return SDValue(); bool truncatedToBoolWithAnd = false; // Skip (zext $x), (trunc $x), or (and $x, 1) node. while (SetCC.getOpcode() == ISD::ZERO_EXTEND || SetCC.getOpcode() == ISD::TRUNCATE || SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx < 0) break; SetCC = SetCC.getOperand(OpIdx); truncatedToBoolWithAnd = true; } else SetCC = SetCC.getOperand(0); } switch (SetCC.getOpcode()) { case X86ISD::SETCC_CARRY: // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, // i.e. 
it's a comparison against true but the result of SETCC_CARRY is not // truncated to i1 using 'and'. if (checkAgainstTrue && !truncatedToBoolWithAnd) break; assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && "Invalid use of SETCC_CARRY!"); LLVM_FALLTHROUGH; case X86ISD::SETCC: // Set the condition code or opposite one if necessary. CC = X86::CondCode(SetCC.getConstantOperandVal(0)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(1); case X86ISD::CMOV: { // Check whether false/true value has canonical one, i.e. 0 or 1. ConstantSDNode *FVal = dyn_cast(SetCC.getOperand(0)); ConstantSDNode *TVal = dyn_cast(SetCC.getOperand(1)); // Quit if true value is not a constant. if (!TVal) return SDValue(); // Quit if false value is not a constant. if (!FVal) { SDValue Op = SetCC.getOperand(0); // Skip 'zext' or 'trunc' node. if (Op.getOpcode() == ISD::ZERO_EXTEND || Op.getOpcode() == ISD::TRUNCATE) Op = Op.getOperand(0); // A special case for rdrand/rdseed, where 0 is set if false cond is // found. if ((Op.getOpcode() != X86ISD::RDRAND && Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) return SDValue(); } // Quit if false value is not the constant 0 or 1. bool FValIsFalse = true; if (FVal && FVal->getZExtValue() != 0) { if (FVal->getZExtValue() != 1) return SDValue(); // If FVal is 1, opposite cond is needed. needOppositeCond = !needOppositeCond; FValIsFalse = false; } // Quit if TVal is not the constant opposite of FVal. if (FValIsFalse && TVal->getZExtValue() != 1) return SDValue(); if (!FValIsFalse && TVal->getZExtValue() != 0) return SDValue(); CC = X86::CondCode(SetCC.getConstantOperandVal(2)); if (needOppositeCond) CC = X86::GetOppositeBranchCondition(CC); return SetCC.getOperand(3); } } return SDValue(); } /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. /// Match: /// (X86or (X86setcc) (X86setcc)) /// (X86cmp (and (X86setcc) (X86setcc)), 0) static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); } isAnd = false; SDValue SetCC0, SetCC1; switch (Cond->getOpcode()) { default: return false; case ISD::AND: case X86ISD::AND: isAnd = true; LLVM_FALLTHROUGH; case ISD::OR: case X86ISD::OR: SetCC0 = Cond->getOperand(0); SetCC1 = Cond->getOperand(1); break; }; // Make sure we have SETCC nodes, using the same flags value. if (SetCC0.getOpcode() != X86ISD::SETCC || SetCC1.getOpcode() != X86ISD::SETCC || SetCC0->getOperand(1) != SetCC1->getOperand(1)) return false; CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); Flags = SetCC0->getOperand(1); return true; } // When legalizing carry, we create carries via add X, -1 // If that comes from an actual carry, via setcc, we use the // carry directly. 
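// e.g. EFLAGS = (X86ISD::ADD (and (setcc_carry COND_B, F), 1), -1) merely
// re-creates the carry that F already holds, so F is used directly.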
static SDValue combineCarryThroughADD(SDValue EFLAGS) { if (EFLAGS.getOpcode() == X86ISD::ADD) { if (isAllOnesConstant(EFLAGS.getOperand(1))) { SDValue Carry = EFLAGS.getOperand(0); while (Carry.getOpcode() == ISD::TRUNCATE || Carry.getOpcode() == ISD::ZERO_EXTEND || Carry.getOpcode() == ISD::SIGN_EXTEND || Carry.getOpcode() == ISD::ANY_EXTEND || (Carry.getOpcode() == ISD::AND && isOneConstant(Carry.getOperand(1)))) Carry = Carry.getOperand(0); if (Carry.getOpcode() == X86ISD::SETCC || Carry.getOpcode() == X86ISD::SETCC_CARRY) { if (Carry.getConstantOperandVal(0) == X86::COND_B) return Carry.getOperand(1); } } } return SDValue(); } /// Optimize an EFLAGS definition used according to the condition code \p CC /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (CC == X86::COND_B) if (SDValue Flags = combineCarryThroughADD(EFLAGS)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); SDValue Cond = N->getOperand(3); // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags}; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } // If this is a select between two integer constants, try to do some // optimizations. Note that the operands are ordered the opposite of SELECT // operands. if (ConstantSDNode *TrueC = dyn_cast(TrueOp)) { if (ConstantSDNode *FalseC = dyn_cast(FalseOp)) { // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is // larger than FalseC (the false value). if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. // This is efficient for any integer data type (including i8/i16) and // shift amount. if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); return Cond; } // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient // for any integer data type, including i8/i16. if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { Cond = getSETCC(CC, Cond, DL, DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } // Optimize cases that will turn into an LEA instruction. 
This requires // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; bool isFastMultiplier = false; if (Diff < 10) { switch ((unsigned char)Diff) { default: break; case 1: // result = add base, cond case 2: // result = lea base( , cond*2) case 3: // result = lea base(cond, cond*2) case 4: // result = lea base( , cond*4) case 5: // result = lea base(cond, cond*4) case 8: // result = lea base( , cond*8) case 9: // result = lea base(cond, cond*8) isFastMultiplier = true; break; } } if (isFastMultiplier) { APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); Cond = getSETCC(CC, Cond, DL ,DAG); // Zero extend the condition if needed. Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); // Scale the condition by the difference. if (Diff != 1) Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, DAG.getConstant(Diff, DL, Cond.getValueType())); // Add the base if non-zero. if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); return Cond; } } } } // Handle these cases: // (select (x != c), e, c) -> select (x != c), e, x), // (select (x == c), c, e) -> select (x == c), x, e) // where the c is an integer constant, and the "select" is the combination // of CMOV and CMP. // // The rationale for this change is that the conditional-move from a constant // needs two instructions, however, conditional-move from a register needs // only one instruction. // // CAVEAT: By replacing a constant with a symbolic value, it may obscure // some instruction-combining opportunities. This opt needs to be // postponed as late as possible. // if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { // the DCI.xxxx conditions are provided to postpone the optimization as // late as possible. ConstantSDNode *CmpAgainst = nullptr; if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && (CmpAgainst = dyn_cast(Cond.getOperand(1))) && !isa(Cond.getOperand(0))) { if (CC == X86::COND_NE && CmpAgainst == dyn_cast(FalseOp)) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueOp, FalseOp); } if (CC == X86::COND_E && CmpAgainst == dyn_cast(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } } // Fold and/or of setcc's to double CMOV: // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) // // This combine lets us generate: // cmovcc1 (jcc1 if we don't have CMOV) // cmovcc2 (same) // instead of: // setcc1 // setcc2 // and/or // cmovne (jne if we don't have CMOV) // When we can't use the CMOV instruction, it might increase branch // mispredicts. // When we can use CMOV, or when there is no mispredict, this improves // throughput and reduces register pressure. 
// if (CC == X86::COND_NE) { SDValue Flags; X86::CondCode CC0, CC1; bool isAndSetCC; if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { if (isAndSetCC) { std::swap(FalseOp, TrueOp); CC0 = X86::GetOppositeBranchCondition(CC0); CC1 = X86::GetOppositeBranchCondition(CC1); } SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } } // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) -> // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) if ((CC == X86::COND_NE || CC == X86::COND_E) && Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { SDValue Add = TrueOp; SDValue Const = FalseOp; // Canonicalize the condition code for easier matching and output. if (CC == X86::COND_E) std::swap(Add, Const); // We might have replaced the constant in the cmov with the LHS of the // compare. If so change it to the RHS of the compare. if (Const == Cond.getOperand(0)) Const = Cond.getOperand(1); // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. if (isa(Const) && Add.getOpcode() == ISD::ADD && Add.hasOneUse() && isa(Add.getOperand(1)) && (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || Add.getOperand(0).getOpcode() == ISD::CTTZ) && Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { EVT VT = N->getValueType(0); // This should constant fold. SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), DAG.getConstant(X86::COND_NE, DL, MVT::i8), Cond); return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); } } return SDValue(); } /// Different mul shrinking modes. enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { EVT VT = N->getOperand(0).getValueType(); if (VT.getScalarSizeInBits() != 32) return false; assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); unsigned SignBits[2] = {1, 1}; bool IsPositive[2] = {false, false}; for (unsigned i = 0; i < 2; i++) { SDValue Opd = N->getOperand(i); SignBits[i] = DAG.ComputeNumSignBits(Opd); IsPositive[i] = DAG.SignBitIsZero(Opd); } bool AllPositive = IsPositive[0] && IsPositive[1]; unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); // When ranges are from -128 ~ 127, use MULS8 mode. if (MinSignBits >= 25) Mode = MULS8; // When ranges are from 0 ~ 255, use MULU8 mode. else if (AllPositive && MinSignBits >= 24) Mode = MULU8; // When ranges are from -32768 ~ 32767, use MULS16 mode. else if (MinSignBits >= 17) Mode = MULS16; // When ranges are from 0 ~ 65535, use MULU16 mode. else if (AllPositive && MinSignBits >= 16) Mode = MULU16; else return false; return true; } /// When the operands of vector mul are extended from smaller size values, /// like i8 and i16, the type of mul may be shrinked to generate more /// efficient code. 
Two typical patterns are handled: /// Pattern1: /// %2 = sext/zext %1 to /// %4 = sext/zext %3 to // or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// Pattern2: /// %2 = zext/sext %1 to /// %4 = zext/sext %3 to /// or %4 = build_vector %C1, ..., %CN (%C1..%CN are constants) /// %5 = mul %2, %4 /// /// There are four mul shrinking modes: /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is /// -128 to 128, and the scalar value range of %4 is also -128 to 128, /// generate pmullw+sext32 for it (MULS8 mode). /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is /// 0 to 255, and the scalar value range of %4 is also 0 to 255, /// generate pmullw+zext32 for it (MULU8 mode). /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, /// generate pmullw+pmulhw for it (MULS16 mode). /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, /// generate pmullw+pmulhuw for it (MULU16 mode). static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Check for legality // pmullw/pmulhw are not supported by SSE. if (!Subtarget.hasSSE2()) return SDValue(); // Check for profitability // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(N, DAG, Mode)) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); unsigned NumElts = VT.getVectorNumElements(); if ((NumElts % 2) != 0) return SDValue(); unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); if (ExperimentalVectorWideningLegalization || NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); if (Mode == MULU8 || Mode == MULS8) return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, ReducedVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. SmallVector ShuffleMask(NumElts); for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. 
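// e.g. for an 8-element multiply this is the mask <4,12,5,13,6,14,7,15>,
// interleaving the upper halves of MulLo and MulHi.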
for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i + NumElts / 2; ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want // to legalize the mul explicitly because implicit legalization for type // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack // instructions which will not exist when we explicitly legalize it by // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with // <4 x i16> undef). // // Legalize the operands of mul. // FIXME: We may be able to handle non-concatenated vectors by insertion. unsigned ReducedSizeInBits = ReducedVT.getSizeInBits(); if ((RegSize % ReducedSizeInBits) != 0) return SDValue(); SmallVector Ops(RegSize / ReducedSizeInBits, DAG.getUNDEF(ReducedVT)); Ops[0] = NewN0; NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); Ops[0] = NewN1; NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops); if (Mode == MULU8 || Mode == MULS8) { // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower // part is needed. SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); // convert the type of mul result to VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::SIGN_EXTEND_VECTOR_INREG, DL, ResVT, Mul); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } // Generate the lower and higher part of mul: pmulhw/pmulhuw. For // MULU16/MULS16, both parts are needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1); SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, OpsVT, NewN0, NewN1); // Repack the lower part and higher part result of mul into a wider // result. Make sure the type of mul result is VT. MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); Res = DAG.getBitcast(ResVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL) { auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mult, DL, VT)); Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(Shift, DL, MVT::i8)); Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) { SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(Mul1, DL, VT)); Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result, DAG.getConstant(Mul2, DL, VT)); Result = DAG.getNode(isAdd ? 
ISD::ADD : ISD::SUB, DL, VT, Result, N->getOperand(0)); return Result; }; switch (MulAmt) { default: break; case 11: // mul x, 11 => add ((shl (mul x, 5), 1), x) return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); case 21: // mul x, 21 => add ((shl (mul x, 5), 2), x) return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); case 41: // mul x, 41 => add ((shl (mul x, 5), 3), x) return combineMulShlAddOrSub(5, 3, /*isAdd*/ true); case 22: // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); case 19: // mul x, 19 => add ((shl (mul x, 9), 1), x) return combineMulShlAddOrSub(9, 1, /*isAdd*/ true); case 37: // mul x, 37 => add ((shl (mul x, 9), 2), x) return combineMulShlAddOrSub(9, 2, /*isAdd*/ true); case 73: // mul x, 73 => add ((shl (mul x, 9), 3), x) return combineMulShlAddOrSub(9, 3, /*isAdd*/ true); case 13: // mul x, 13 => add ((shl (mul x, 3), 2), x) return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); case 23: // mul x, 23 => sub ((shl (mul x, 3), 3), x) return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); case 26: // mul x, 26 => add ((mul (mul x, 5), 5), x) return combineMulMulAddOrSub(5, 5, /*isAdd*/ true); case 28: // mul x, 28 => add ((mul (mul x, 9), 3), x) return combineMulMulAddOrSub(9, 3, /*isAdd*/ true); case 29: // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), combineMulMulAddOrSub(9, 3, /*isAdd*/ true)); } // Another trick. If this is a power 2 + 2/4/8, we can use a shift followed // by a single LEA. // First check if this a sum of two power of 2s because that's easy. Then // count how many zeros are up to the first bit. // TODO: We can do this even without LEA at a cost of two shifts and an add. if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { unsigned ScaleShift = countTrailingZeros(MulAmt); if (ScaleShift >= 1 && ScaleShift < 4) { unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ShiftAmt, DL, MVT::i8)); SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(ScaleShift, DL, MVT::i8)); return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); } } return SDValue(); } // If the upper 17 bits of each element are zero then we can use PMADDWD, // which is always at least as quick as PMULLD, except on KNL. static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (Subtarget.isPMADDWDSlow()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi32 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) return SDValue(); // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case. // Also allow v2i32 if it will be widened. MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements()); if (!((ExperimentalVectorWideningLegalization && VT == MVT::v2i32) || DAG.getTargetLoweringInfo().isTypeLegal(WVT))) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // If we are zero extending two steps without SSE4.1, its better to reduce // the vmul width instead. 
if (!Subtarget.hasSSE41() && (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getScalarValueSizeInBits() <= 8) && (N1.getOpcode() == ISD::ZERO_EXTEND && N1.getOperand(0).getScalarValueSizeInBits() <= 8)) return SDValue(); APInt Mask17 = APInt::getHighBitsSet(32, 17); if (!DAG.MaskedValueIsZero(N1, Mask17) || !DAG.MaskedValueIsZero(N0, Mask17)) return SDValue(); // Use SplitOpsAndApply to handle AVX splitting. auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) }, PMADDWDBuilder); } static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); EVT VT = N->getValueType(0); // Only support vXi64 vectors. if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 || VT.getVectorNumElements() < 2 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // MULDQ returns the 64-bit result of the signed multiplication of the lower // 32-bits. We can lower with this if the sign bits stretch that far. if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 && DAG.ComputeNumSignBits(N1) > 32) { auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, PMULDQBuilder, /*CheckBWI*/false); } // If the upper bits are zero we can use a single pmuludq. APInt Mask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) { auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, PMULUDQBuilder, /*CheckBWI*/false); } return SDValue(); } /// Optimize a single multiply with constant into two operations in order to /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget)) return V; if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget)) return V; if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (VT != MVT::i64 && VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); if (isPowerOf2_64(C->getZExtValue())) return SDValue(); int64_t SignMulAmt = C->getSExtValue(); assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); uint64_t AbsMulAmt = SignMulAmt < 0 ? 
-SignMulAmt : SignMulAmt; SDLoc DL(N); if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(AbsMulAmt, DL, VT)); if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); return NewMul; } uint64_t MulAmt1 = 0; uint64_t MulAmt2 = 0; if ((AbsMulAmt % 9) == 0) { MulAmt1 = 9; MulAmt2 = AbsMulAmt / 9; } else if ((AbsMulAmt % 5) == 0) { MulAmt1 = 5; MulAmt2 = AbsMulAmt / 5; } else if ((AbsMulAmt % 3) == 0) { MulAmt1 = 3; MulAmt2 = AbsMulAmt / 3; } SDValue NewMul; // For negative multiply amounts, only allow MulAmt2 to be a power of 2. if (MulAmt2 && (isPowerOf2_64(MulAmt2) || (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) // If the second multiplier is pow2, issue it first. We want the multiply by // 3, 5, or 9 to be folded into the addressing mode unless the lone use // is an add. Only do this for positive multiply amounts since the // negate would prevent it from being used as an address mode anyway. std::swap(MulAmt1, MulAmt2); if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), DAG.getConstant(MulAmt1, DL, VT)); if (isPowerOf2_64(MulAmt2)) NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); // Negate the result. if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } else if (!Subtarget.slowLEA()) NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); if (!NewMul) { assert(C->getZExtValue() != 0 && C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && "Both cases that could cause potential overflows should have " "already been handled."); if (isPowerOf2_64(AbsMulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) NewMul = DAG.getNode( ISD::ADD, DL, VT, N->getOperand(0), DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, MVT::i8))); // To negate, subtract the number from zero if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul); } else if (isPowerOf2_64(AbsMulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, MVT::i8)); // To negate, reverse the operands of the subtract.
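// Worked example (illustrative, not from the original source): mul x, -7
// reaches this branch with AbsMulAmt == 7 == 2^3 - 1:
//   t = x << 3
//   positive amount: t - x  ==  7 * x
//   negative amount: x - t  == -7 * x   (operands reversed below)
// e.g. x = 2: 2 - 16 == -14 == -7 * 2.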
if (SignMulAmt < 0) NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul); else NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) { // (mul x, 2^N + 2) => (add (add (shl x, N), x), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, MVT::i8)); NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0)); NewMul = DAG.getNode(ISD::ADD, DL, VT, NewMul, N->getOperand(0)); } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) { // (mul x, 2^N - 2) => (sub (sub (shl x, N), x), x) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, MVT::i8)); NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); } } return NewMul; } static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantSDNode *N1C = dyn_cast(N1); EVT VT = N0.getValueType(); // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) // since the result of setcc_c is all zero's or all ones. if (VT.isInteger() && !VT.isVector() && N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask <<= N1C->getAPIntValue(); bool MaskOK = false; // We can handle cases concerning bit-widening nodes containing setcc_c if // we carefully interrogate the mask to make sure we are semantics // preserving. // The transform is not safe if the result of C1 << C2 exceeds the bitwidth // of the underlying setcc_c operation if the setcc_c was zero extended. // Consider the following example: // zext(setcc_c) -> i32 0x0000FFFF // c1 -> i32 0x0000FFFF // c2 -> i32 0x00000001 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE if (N00.getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if (N00.getOpcode() == ISD::SIGN_EXTEND && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = true; } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) && N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); } if (MaskOK && Mask != 0) { SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } // Hardware support for vector shifts is sparse which makes us scalarize the // vector operations in many cases. Also, on sandybridge ADD is faster than // shl. // (shl V, 1) -> add V,V if (auto *N1BV = dyn_cast(N1)) if (auto *N1SplatC = N1BV->getConstantSplatNode()) { assert(N0.getValueType().isVector() && "Invalid vector shift type"); // We shift all of the values by one. In many cases we do not have // hardware support for this operation. This is better expressed as an ADD // of two values. 
if (N1SplatC->getAPIntValue() == 1) return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0); } return SDValue(); } static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned Size = VT.getSizeInBits(); // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) // depending on sign of (SarConst - [56,48,32,24,16]) // sexts in X86 are MOVs. The MOVs have the same code size // as above SHIFTs (only SHIFT on 1 has lower code size). // However the MOVs have 2 advantages to a SHIFT: // 1. MOVs can write to a register that differs from source // 2. MOVs accept memory operands if (VT.isVector() || N1.getOpcode() != ISD::Constant || N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || N0.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); APInt ShlConst = (cast(N01))->getAPIntValue(); APInt SarConst = (cast(N1))->getAPIntValue(); EVT CVT = N1.getValueType(); if (SarConst.isNegative()) return SDValue(); for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { unsigned ShiftSize = SVT.getSizeInBits(); // skipping types without corresponding sext/zext and // ShlConst that is not one of [56,48,32,24,16] if (ShiftSize >= Size || ShlConst != Size - ShiftSize) continue; SDLoc DL(N); SDValue NN = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); SarConst = SarConst - (Size - ShiftSize); if (SarConst == 0) return NN; else if (SarConst.isNegative()) return DAG.getNode(ISD::SHL, DL, VT, NN, DAG.getConstant(-SarConst, DL, CVT)); else return DAG.getNode(ISD::SRA, DL, VT, NN, DAG.getConstant(SarConst, DL, CVT)); } return SDValue(); } static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // Only do this on the last DAG combine as it can interfere with other // combines. if (!DCI.isAfterLegalizeDAG()) return SDValue(); // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. // TODO: This is a generic DAG combine that became an x86-only combine to // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and // and-not ('andn'). if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); auto *ShiftC = dyn_cast(N1); auto *AndC = dyn_cast(N0.getOperand(1)); if (!ShiftC || !AndC) return SDValue(); // If we can shrink the constant mask below 8-bits or 32-bits, then this // transform should reduce code size. It may also enable secondary transforms // from improved known-bits analysis or instruction selection. APInt MaskVal = AndC->getAPIntValue(); // If this can be matched by a zero extend, don't optimize. 
if (MaskVal.isMask()) { unsigned TO = MaskVal.countTrailingOnes(); if (TO >= 8 && isPowerOf2_32(TO)) return SDValue(); } APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); unsigned OldMaskSize = MaskVal.getMinSignedBits(); unsigned NewMaskSize = NewMaskVal.getMinSignedBits(); if ((OldMaskSize > 8 && NewMaskSize <= 8) || (OldMaskSize > 32 && NewMaskSize <= 32)) { // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) SDLoc DL(N); SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); } return SDValue(); } static SDValue combineShift(SDNode* N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (N->getOpcode() == ISD::SHL) if (SDValue V = combineShiftLeft(N, DAG)) return V; if (N->getOpcode() == ISD::SRA) if (SDValue V = combineShiftRightArithmetic(N, DAG)) return V; if (N->getOpcode() == ISD::SRL) if (SDValue V = combineShiftRightLogical(N, DAG, DCI)) return V; return SDValue(); } static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned DstBitsPerElt = VT.getScalarSizeInBits(); unsigned SrcBitsPerElt = 2 * DstBitsPerElt; assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && N1.getScalarValueSizeInBits() == SrcBitsPerElt && "Unexpected PACKSS/PACKUS input type"); bool IsSigned = (X86ISD::PACKSS == Opcode); // Constant Folding. APInt UndefElts0, UndefElts1; SmallVector EltBits0, EltBits1; if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { unsigned NumLanes = VT.getSizeInBits() / 128; unsigned NumDstElts = VT.getVectorNumElements(); unsigned NumSrcElts = NumDstElts / 2; unsigned NumDstEltsPerLane = NumDstElts / NumLanes; unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; APInt Undefs(NumDstElts, 0); SmallVector Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); if (UndefElts[SrcIdx]) { Undefs.setBit(Lane * NumDstEltsPerLane + Elt); continue; } APInt &Val = EltBits[SrcIdx]; if (IsSigned) { // PACKSS: Truncate signed value with signed saturation. // Source values less than dst minint are saturated to minint. // Source values greater than dst maxint are saturated to maxint. if (Val.isSignedIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getSignedMinValue(DstBitsPerElt); else Val = APInt::getSignedMaxValue(DstBitsPerElt); } else { // PACKUS: Truncate signed value with unsigned saturation. // Source values less than zero are saturated to zero. // Source values greater than dst maxuint are saturated to maxuint. 
if (Val.isIntN(DstBitsPerElt)) Val = Val.trunc(DstBitsPerElt); else if (Val.isNegative()) Val = APInt::getNullValue(DstBitsPerElt); else Val = APInt::getAllOnesValue(DstBitsPerElt); } Bits[Lane * NumDstEltsPerLane + Elt] = Val; } } return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); } // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular // truncate to create a larger truncate. if (Subtarget.hasAVX512() && N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && N0.getOperand(0).getValueType() == MVT::v8i32) { if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || (!IsSigned && DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) { if (Subtarget.hasVLX()) return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0)); // Widen input to v16i32 so we can truncate that. SDLoc dl(N); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, N0.getOperand(0), DAG.getUNDEF(MVT::v8i32)); return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat); } } // Attempt to combine as shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; return SDValue(); } static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() || X86ISD::VSRL == N->getOpcode()) && "Unexpected shift opcode"); EVT VT = N->getValueType(0); // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) return DAG.getConstant(0, SDLoc(N), VT); APInt KnownUndef, KnownZero; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements()); if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef, KnownZero, DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || X86ISD::VSRLI == Opcode) && "Unexpected shift opcode"); bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && "Unexpected value type"); assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. unsigned ShiftVal = cast(N1)->getZExtValue(); if (ShiftVal >= NumBitsPerElt) { if (LogicalShift) return DAG.getConstant(0, SDLoc(N), VT); else ShiftVal = NumBitsPerElt - 1; } // Shift N0 by zero -> N0. if (!ShiftVal) return N0; // Shift zero -> zero. if (ISD::isBuildVectorAllZeros(N0.getNode())) return DAG.getConstant(0, SDLoc(N), VT); // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2) // clamped to (NumBitsPerElt - 1). 
if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) { unsigned ShiftVal2 = cast(N0.getOperand(1))->getZExtValue(); unsigned NewShiftVal = ShiftVal + ShiftVal2; if (NewShiftVal >= NumBitsPerElt) NewShiftVal = NumBitsPerElt - 1; return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0), DAG.getConstant(NewShiftVal, SDLoc(N), MVT::i8)); } // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } // Constant Folding. APInt UndefElts; SmallVector EltBits; if (N->isOnlyUserOf(N0.getNode()) && getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) { assert(EltBits.size() == VT.getVectorNumElements() && "Unexpected shift value type"); for (APInt &Elt : EltBits) { if (X86ISD::VSHLI == Opcode) Elt <<= ShiftVal; else if (X86ISD::VSRAI == Opcode) Elt.ashrInPlace(ShiftVal); else Elt.lshrInPlace(ShiftVal); } return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(NumBitsPerElt), DCI)) return SDValue(N, 0); return SDValue(); } static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert( ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) || (N->getOpcode() == X86ISD::PINSRW && N->getValueType(0) == MVT::v8i16)) && "Unexpected vector insertion"); // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; return SDValue(); } /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for /// OR -> CMPNEQSS. static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned opcode; // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but // we're requiring SSE2 for both. if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CMP0 = N0->getOperand(1); SDValue CMP1 = N1->getOperand(1); SDLoc DL(N); // The SETCCs should both refer to the same CMP. 
if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1) return SDValue(); SDValue CMP00 = CMP0->getOperand(0); SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); if (VT == MVT::f32 || VT == MVT::f64) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); !ExpectingFlags && UI != UE; ++UI) switch (UI->getOpcode()) { default: case ISD::BR_CC: case ISD::BRCOND: case ISD::SELECT: ExpectingFlags = true; break; case ISD::CopyToReg: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: break; } if (!ExpectingFlags) { enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { X86::CondCode tmp = cc0; cc0 = cc1; cc1 = tmp; } if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { // FIXME: need symbolic constants for these magic numbers. // See X86ATTInstPrinter.cpp:printSSECC(). unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; if (Subtarget.hasAVX512()) { SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); // Need to fill with zeros to ensure the bitcast will produce zeroes // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, DAG.getConstant(0, DL, MVT::v16i1), FSetCC, DAG.getIntPtrConstant(0, DL)); return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, N->getSimpleValueType(0)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); bool is64BitFP = (CMP00.getValueType() == MVT::f64); MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; if (is64BitFP && !Subtarget.is64Bit()) { // On a 32-bit target, we cannot bitcast the 64-bit float to a // 64-bit integer, since that's not a legal type. Since // OnesOrZeroesF is all ones of all zeroes, we don't need all the // bits, but can do this little dance to extract the lowest 32 bits // and work with those going forward. SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, OnesOrZeroesF); SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Vector32, DAG.getIntPtrConstant(0, DL)); IntVT = MVT::i32; } SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, DAG.getConstant(1, DL, IntVT)); SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed); return OneBitOfTruth; } } } } return SDValue(); } /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). 
static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::AND); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); SDValue X, Y; SDValue N0 = peekThroughBitcasts(N->getOperand(0)); SDValue N1 = peekThroughBitcasts(N->getOperand(1)); if (N0.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) { X = N0.getOperand(0); Y = N1; } else if (N1.getOpcode() == ISD::XOR && ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) { X = N1.getOperand(0); Y = N0; } else return SDValue(); X = DAG.getBitcast(VT, X); Y = DAG.getBitcast(VT, Y); return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); } // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized // register. In most cases we actually compare or select YMM-sized registers // and mixing the two types creates horrible code. This method optimizes // some of the transition sequences. // Even with AVX-512 this is still useful for removing casts around logical // operations on vXi1 mask types. static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); assert(VT.isVector() && "Expected vector type"); assert((N->getOpcode() == ISD::ANY_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); SDValue Narrow = N->getOperand(0); EVT NarrowVT = Narrow.getValueType(); if (Narrow->getOpcode() != ISD::XOR && Narrow->getOpcode() != ISD::AND && Narrow->getOpcode() != ISD::OR) return SDValue(); SDValue N0 = Narrow->getOperand(0); SDValue N1 = Narrow->getOperand(1); SDLoc DL(Narrow); // The Left side has to be a trunc. if (N0.getOpcode() != ISD::TRUNCATE) return SDValue(); // The type of the truncated inputs. if (N0->getOperand(0).getValueType() != VT) return SDValue(); // The right side has to be a 'trunc' or a constant vector. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT)) return SDValue(); // Set N0 and N1 to hold the inputs to the new wide operation. N0 = N0->getOperand(0); if (RHSTrunc) N1 = N1->getOperand(0); else N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); // Generate the wide operation. SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1); unsigned Opcode = N->getOpcode(); switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType()); case ISD::SIGN_EXTEND: return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(NarrowVT)); } } /// If both input operands of a logic op are being cast from floating point /// types, try to convert this into a floating point logic node to avoid /// unnecessary moves from SSE to integer registers. 
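/// For example (illustrative, not part of the original comment), with SSE1
/// and an i32 result:
///   (and (bitcast f32 %a to i32), (bitcast f32 %b to i32))
///     --> (bitcast (X86ISD::FAND %a, %b) to i32)
/// which keeps the values in XMM registers instead of bouncing them through
/// GPRs just to perform the AND.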
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned FPOpcode = ISD::DELETED_NODE; if (N->getOpcode() == ISD::AND) FPOpcode = X86ISD::FAND; else if (N->getOpcode() == ISD::OR) FPOpcode = X86ISD::FOR; else if (N->getOpcode() == ISD::XOR) FPOpcode = X86ISD::FXOR; assert(FPOpcode != ISD::DELETED_NODE && "Unexpected input node for FP logic conversion"); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && ((Subtarget.hasSSE1() && VT == MVT::i32) || (Subtarget.hasSSE2() && VT == MVT::i64))) { SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); EVT N00Type = N00.getValueType(); EVT N10Type = N10.getValueType(); if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); return DAG.getBitcast(VT, FPLogic); } } return SDValue(); } /// If this is a zero/all-bits result that is bitwise-anded with a low bits /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' /// with a shift-right to eliminate loading the vector constant mask value. static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); EVT VT0 = Op0.getValueType(); EVT VT1 = Op1.getValueType(); if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger()) return SDValue(); APInt SplatVal; if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || !SplatVal.isMask()) return SDValue(); // Don't prevent creation of ANDN. if (isBitwiseNot(Op0)) return SDValue(); if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL)) return SDValue(); unsigned EltBitWidth = VT0.getScalarSizeInBits(); if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) return SDValue(); SDLoc DL(N); unsigned ShiftVal = SplatVal.countTrailingOnes(); SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8); SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt); return DAG.getBitcast(N->getValueType(0), Shift); } // Get the index node from the lowered DAG of a GEP IR instruction with one // indexing dimension. static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { if (Ld->isIndexed()) return SDValue(); SDValue Base = Ld->getBasePtr(); if (Base.getOpcode() != ISD::ADD) return SDValue(); SDValue ShiftedIndex = Base.getOperand(0); if (ShiftedIndex.getOpcode() != ISD::SHL) return SDValue(); return ShiftedIndex.getOperand(0); } static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { if (Subtarget.hasBMI2() && VT.isScalarInteger()) { switch (VT.getSizeInBits()) { default: return false; case 64: return Subtarget.is64Bit() ? true : false; case 32: return true; } } return false; } // This function recognizes cases where X86 bzhi instruction can replace and // 'and-load' sequence. // In case of loading integer value from an array of constants which is defined // as follows: // // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} // // then applying a bitwise and on the result with another input. // It's equivalent to performing bzhi (zero high bits) on the input, with the // same index of the load. 
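// Worked example (illustrative): with array[i] == (1 << i) - 1, the sequence
//   %m = load arr[%idx]
//   %r = and %x, %m
// keeps exactly the low %idx bits of %x (e.g. %idx == 5 loads 0x1F). The
// rewrite below builds (and %x, (srl 0xFFFFFFFF, (32 - %idx))), which
// instruction selection then matches as a single BZHI.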
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Node->getSimpleValueType(0); SDLoc dl(Node); // Check if subtarget has BZHI instruction for the node's type if (!hasBZHI(Subtarget, VT)) return SDValue(); // Try matching the pattern for both operands. for (unsigned i = 0; i < 2; i++) { SDValue N = Node->getOperand(i); LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode()); // continue if the operand is not a load instruction if (!Ld) return SDValue(); const Value *MemOp = Ld->getMemOperand()->getValue(); if (!MemOp) return SDValue(); if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); if (!isa<ConstantDataArray>(Init) || !Ty->getArrayElementType()->isIntegerTy() || Ty->getArrayElementType()->getScalarSizeInBits() != VT.getSizeInBits() || Ty->getArrayNumElements() > Ty->getArrayElementType()->getScalarSizeInBits()) continue; // Check if the array's constant elements are suitable to our case. uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); bool ConstantsMatch = true; for (uint64_t j = 0; j < ArrayElementCount; j++) { ConstantInt *Elem = dyn_cast<ConstantInt>(Init->getAggregateElement(j)); if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { ConstantsMatch = false; break; } } if (!ConstantsMatch) continue; // Do the transformation (For 32-bit type): // -> (and (load arr[idx]), inp) // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) // that will be replaced with one bzhi instruction. SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32); // Get the Node which indexes into the array. SDValue Index = getIndexFromUnindexedLoad(Ld); if (!Index) return SDValue(); Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32); SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index); Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub); SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); } } } } return SDValue(); } // Look for (and (ctpop X), 1) which is the IR form of __builtin_parity. // Turn it into a series of XORs and a setnp. static SDValue combineParity(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // We only support 64-bit and 32-bit. 64-bit requires special handling // unless the 64-bit popcnt instruction is legal. if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(VT) && TLI.isOperationLegal(ISD::CTPOP, VT)) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // LHS needs to be a single use CTPOP. if (N0.getOpcode() != ISD::CTPOP || !N0.hasOneUse()) return SDValue(); // RHS needs to be 1. if (!isOneConstant(N1)) return SDValue(); SDLoc DL(N); SDValue X = N0.getOperand(0); // If this is 64-bit, it's always best to xor the two 32-bit pieces together // even if we have popcnt. if (VT == MVT::i64) { SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(32, DL, MVT::i8))); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); // Generate a 32-bit parity idiom. This will bring us back here if we need // to expand it too.
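// Illustrative sketch of what the 32-bit expansion eventually produces
// (scalar pseudo-C, names are ours, not from the original source):
//   x ^= x >> 16;                               // fold bits 31..16 into 15..0
//   f  = (uint8_t)x ^ (uint8_t)(x >> 8);        // 8-bit xor sets PF
//   parity = setnp;                             // PF clear <=> odd popcount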
SDValue Parity = DAG.getNode(ISD::AND, DL, MVT::i32, DAG.getNode(ISD::CTPOP, DL, MVT::i32, X), DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Parity); } assert(VT == MVT::i32 && "Unexpected VT!"); // Xor the high and low 16-bits together using a 32-bit operation. SDValue Hi16 = DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(16, DL, MVT::i8)); X = DAG.getNode(ISD::XOR, DL, VT, X, Hi16); // Finally xor the low 2 bytes together and use a 8-bit flag setting xor. // This should allow an h-reg to be used to save a shift. // FIXME: We only get an h-reg in 32-bit mode. SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, DAG.getNode(ISD::SRL, DL, VT, X, DAG.getConstant(8, DL, MVT::i8))); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); // Copy the inverse of the parity flag into a register with setcc. SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); // Zero extend to original type. return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } // Use a 32-bit and+zext if upper bits known zero. if (VT == MVT::i64 && Subtarget.is64Bit() && !isa(N->getOperand(1))) { APInt HiMask = APInt::getHighBitsSet(64, 32); if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) || DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) { SDLoc dl(N); SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0)); SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1)); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); } } // This must be done before legalization has expanded the ctpop. if (SDValue V = combineParity(N, DAG, Subtarget)) return V; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG)) return R; if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(N->getOperand(0).getOperand(1))) { SDValue BitMask = N->getOperand(1); SDValue SrcVec = N->getOperand(0).getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. 
APInt UndefElts; SmallVector EltBits; if (VT == SrcVecVT.getScalarType() && N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](APInt M) { return M.isNullValue() || M.isAllOnesValue(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; unsigned Idx = N->getOperand(0).getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. SmallVector ShuffleMask(NumElts * Scale, SM_SentinelUndef); for (unsigned i = 0; i != Scale; ++i) { if (UndefElts[i]) continue; int VecIdx = Scale * Idx + i; ShuffleMask[VecIdx] = EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx; } if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); } } return SDValue(); } // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { if (N->getOpcode() != ISD::OR) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); // Attempt to match OR(AND(M,Y),ANDNP(M,X)). if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) return false; Mask = N1.getOperand(0); X = N1.getOperand(1); // Check to see if the mask appeared in both the AND and ANDNP. if (N0.getOperand(0) == Mask) Y = N0.getOperand(1); else if (N0.getOperand(1) == Mask) Y = N0.getOperand(0); else return false; // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for // ANDNP combine allows other combines to happen that prevent matching. return true; } // Try to fold: // (or (and (m, y), (pandn m, x))) // into: // (vselect m, x, y) // As a special case, try to fold: // (or (and (m, (sub 0, x)), (pandn m, x))) // into: // (sub (xor X, M), M) static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); EVT VT = N->getValueType(0); if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || (VT.is256BitVector() && Subtarget.hasInt256()))) return SDValue(); SDValue X, Y, Mask; if (!matchLogicBlend(N, X, Y, Mask)) return SDValue(); // Validate that X, Y, and Mask are bitcasts, and see through them. Mask = peekThroughBitcasts(Mask); X = peekThroughBitcasts(X); Y = peekThroughBitcasts(Y); EVT MaskVT = Mask.getValueType(); unsigned EltBits = MaskVT.getScalarSizeInBits(); // TODO: Attempt to handle floating point cases as well? if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); // Try to match: // (or (and (M, (sub 0, X)), (pandn M, X))) // which is a special case of vselect: // (vselect M, (sub 0, X), X) // Per: // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate // We know that, if fNegate is 0 or 1: // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) // // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) // ( M ? 
-X : X) == ((X ^ M ) + (M & 1)) // This lets us transform our vselect to: // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); }; SDValue V; if (IsNegV(Y.getNode(), X)) V = X; else if (IsNegV(X.getNode(), Y)) V = Y; if (V) { SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; // If the negate was on the false side of the select, then // the operands of the SUB need to be swapped. PR 27251. // This is because the pattern being matched above is // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) // but if the pattern matched was // (vselect M, X, (sub (0, X))), that is really negation of the pattern // above, -(vselect M, (sub 0, X), X), and therefore the replacement // pattern also needs to be a negation of the replacement pattern above. // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the // sub accomplishes the negation of the replacement pattern. if (V == Y) std::swap(SubOp1, SubOp2); SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); return DAG.getBitcast(VT, Res); } } // PBLENDVB is only available on SSE 4.1. if (!Subtarget.hasSSE41()) return SDValue(); MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } // Helper function for combineOrCmpEqZeroToCtlzSrl // Transforms: // seteq(cmp x, 0) // into: // srl(ctlz x), log2(bitsize(x)) // Input pattern is checked by caller. static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) { SDValue Cmp = Op.getOperand(1); EVT VT = Cmp.getOperand(0).getValueType(); unsigned Log2b = Log2_32(VT.getSizeInBits()); SDLoc dl(Op); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); // The result of the shift is true or false, and on X86, the 32-bit // encoding of shr and lzcnt is more desirable. SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, DAG.getConstant(Log2b, dl, MVT::i8)); return DAG.getZExtOrTrunc(Scc, dl, ExtTy); } // Try to transform: // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) // into: // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) // Will also attempt to match more generic cases, eg: // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) // Only applies if the target supports the FastLZCNT feature. static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) return SDValue(); auto isORCandidate = [](SDValue N) { return (N->getOpcode() == ISD::OR && N->hasOneUse()); }; // Check the zero extend is extending to 32-bit or more. The code generated by // srl(ctlz) for 16-bit or less variants of the pattern would require extra // instructions to clear the upper bits. 
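// Illustrative note (not from the original source), for i32 operands:
// ctlz(x) is 32 (0b100000) only when x == 0 and is at most 31 otherwise, so
// bit 5 of ctlz(x) is exactly (x == 0). That is why
//   zext(or (x == 0), (y == 0))
// can become
//   srl (or (ctlz x), (ctlz y)), 5
// with no extra masking.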
if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || !isORCandidate(N->getOperand(0))) return SDValue(); // Check the node matches: setcc(eq, cmp 0) auto isSetCCCandidate = [](SDValue N) { return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && N->getOperand(1).getOpcode() == X86ISD::CMP && isNullConstant(N->getOperand(1).getOperand(1)) && N->getOperand(1).getValueType().bitsGE(MVT::i32); }; SDNode *OR = N->getOperand(0).getNode(); SDValue LHS = OR->getOperand(0); SDValue RHS = OR->getOperand(1); // Save nodes matching or(or, setcc(eq, cmp 0)). SmallVector ORNodes; while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { ORNodes.push_back(OR); OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); } // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || !isORCandidate(SDValue(OR, 0))) return SDValue(); // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it // to // or(srl(ctlz),srl(ctlz)). // The dag combiner can then fold it into: // srl(or(ctlz, ctlz)). EVT VT = OR->getValueType(0); SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG); SDValue Ret, NewRHS; if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG))) Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS); if (!Ret) return SDValue(); // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. while (ORNodes.size() > 0) { OR = ORNodes.pop_back_val(); LHS = OR->getOperand(0); RHS = OR->getOperand(1); // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). if (RHS->getOpcode() == ISD::OR) std::swap(LHS, RHS); EVT VT = OR->getValueType(0); SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG); if (!NewRHS) return SDValue(); Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS); } if (Ret) Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); return Ret; } static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) return R; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; // Attempt to recursively combine an OR of shuffles. 
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); unsigned Bits = VT.getScalarSizeInBits(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent // series of shifts/or that would otherwise be generated. // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions // have higher latencies and we are not optimizing for size. if (!OptForSize && Subtarget.isSHLDSlow()) return SDValue(); if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SDValue ShAmt0 = N0.getOperand(1); if (ShAmt0.getValueType() != MVT::i8) return SDValue(); SDValue ShAmt1 = N1.getOperand(1); if (ShAmt1.getValueType() != MVT::i8) return SDValue(); // Peek through any modulo shift masks. SDValue ShMsk0; if (ShAmt0.getOpcode() == ISD::AND && isa(ShAmt0.getOperand(1)) && ShAmt0.getConstantOperandVal(1) == (Bits - 1)) { ShMsk0 = ShAmt0; ShAmt0 = ShAmt0.getOperand(0); } SDValue ShMsk1; if (ShAmt1.getOpcode() == ISD::AND && isa(ShAmt1.getOperand(1)) && ShAmt1.getConstantOperandVal(1) == (Bits - 1)) { ShMsk1 = ShAmt1; ShAmt1 = ShAmt1.getOperand(0); } if (ShAmt0.getOpcode() == ISD::TRUNCATE) ShAmt0 = ShAmt0.getOperand(0); if (ShAmt1.getOpcode() == ISD::TRUNCATE) ShAmt1 = ShAmt1.getOperand(0); SDLoc DL(N); unsigned Opc = X86ISD::SHLD; SDValue Op0 = N0.getOperand(0); SDValue Op1 = N1.getOperand(0); if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) { Opc = X86ISD::SHRD; std::swap(Op0, Op1); std::swap(ShAmt0, ShAmt1); std::swap(ShMsk0, ShMsk1); } // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C ) // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C ) // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C ) // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C ) // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> SHLD( X, Y, C ) // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> SHRD( X, Y, C ) if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (auto *SumC = dyn_cast(Sum)) { SDValue ShAmt1Op1 = ShAmt1.getOperand(1); if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE) ShAmt1Op1 = ShAmt1Op1.getOperand(0); if ((SumC->getAPIntValue() == Bits || (SumC->getAPIntValue() == 0 && ShMsk1)) && ShAmt1Op1 == ShAmt0) return DAG.getNode(Opc, DL, VT, Op0, Op1, DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } else if (auto *ShAmt1C = dyn_cast(ShAmt1)) { auto *ShAmt0C = dyn_cast(ShAmt0); if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), N1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } else if (ShAmt1.getOpcode() == ISD::XOR) { SDValue Mask = ShAmt1.getOperand(1); if (auto *MaskC = dyn_cast(Mask)) { unsigned InnerShift = (X86ISD::SHLD == Opc ? 
ISD::SRL : ISD::SHL); SDValue ShAmt1Op0 = ShAmt1.getOperand(0); if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE) ShAmt1Op0 = ShAmt1Op0.getOperand(0); if (MaskC->getSExtValue() == (Bits - 1) && (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) { if (Op1.getOpcode() == InnerShift && isa(Op1.getOperand(1)) && Op1.getConstantOperandVal(1) == 1) { return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ). if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD && Op1.getOperand(0) == Op1.getOperand(1)) { return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0), DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0)); } } } } return SDValue(); } /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: /// SETGT(X, -1) static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { // This is only worth doing if the output type is i8 or i1. EVT ResultType = N->getValueType(0); if (ResultType != MVT::i8 && ResultType != MVT::i1) return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // We should be performing an xor against a truncated shift. if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) return SDValue(); // Make sure we are performing an xor against one. if (!isOneConstant(N1)) return SDValue(); // SetCC on x86 zero extends so only act on this if it's a logical shift. SDValue Shift = N0.getOperand(0); if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) return SDValue(); // Make sure we are truncating from one of i16, i32 or i64. EVT ShiftTy = Shift.getValueType(); if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) return SDValue(); // Make sure the shift amount extracts the sign bit. if (!isa(Shift.getOperand(1)) || Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) return SDValue(); // Create a greater-than comparison against -1. // N.B. Using SETGE against 0 works but we want a canonical looking // comparison, using SETGT matches up with what TranslateX86CC. SDLoc DL(N); SDValue ShiftOp = Shift.getOperand(0); EVT ShiftOpTy = ShiftOp.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResultType); SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); if (SetCCResultType != ResultType) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); return Cond; } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// pcmpgt X, -1 /// /// This should be called before type legalization because the pattern may not /// persist after that. static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); if (!VT.isSimple()) return SDValue(); switch (VT.getSimpleVT().SimpleTy) { default: return SDValue(); case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break; case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break; case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; } // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. 
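// Worked example (illustrative): for a v4i32 element with value -7,
//   sra -7, 31 -> 0xFFFFFFFF,  xor with all-ones -> 0
//   pcmpgt -7, -1 -> 0                      (matches)
// and for +5,
//   sra 5, 31 -> 0,            xor with all-ones -> 0xFFFFFFFF
//   pcmpgt 5, -1 -> 0xFFFFFFFF              (matches)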
SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftBV = dyn_cast(Shift.getOperand(1)); if (!ShiftBV) return SDValue(); EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); auto *ShiftAmt = ShiftBV->getConstantSplatNode(); if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); // Create a greater-than comparison against -1. We don't use the more obvious // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones); } /// Check if truncation with saturation form type \p SrcVT to \p DstVT /// is valid for the given \p Subtarget. static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) { if (!Subtarget.hasAVX512()) return false; // FIXME: Scalar type may be supported if we move it to vector register. if (!SrcVT.isVector()) return false; EVT SrcElVT = SrcVT.getScalarType(); EVT DstElVT = DstVT.getScalarType(); if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32) return false; if (SrcVT.is512BitVector() || Subtarget.hasVLX()) return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI(); return false; } /// Detect patterns of truncation with unsigned saturation: /// /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). /// Return the source value x to be truncated or SDValue() if the pattern was /// not matched. /// /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type), /// where C1 >= 0 and C2 is unsigned max of destination type. /// /// (truncate (smax (smin (x, C2), C1)) to dest_type) /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2. /// /// These two patterns are equivalent to: /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type) /// So return the smax(x, C1) value to be truncated or SDValue() if the /// pattern was not matched. static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { EVT InVT = In.getValueType(); // Saturation with truncation. We truncate from InVT to VT. assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && "Unexpected types for truncate operation"); // Match min/max and return limit value as a parameter. auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue { if (V.getOpcode() == Opcode && ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit)) return V.getOperand(0); return SDValue(); }; APInt C1, C2; if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2)) // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according // the element size of the destination type. 
    if (C2.isMask(VT.getScalarSizeInBits()))
      return UMin;

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
    if (MatchMinMax(SMin, ISD::SMAX, C1))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
        return SMin;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
          C2.uge(C1)) {
        return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
      }

  return SDValue();
}

/// Detect patterns of truncation with signed saturation:
/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
///                  signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
///                  signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
  unsigned NumDstBits = VT.getScalarSizeInBits();
  unsigned NumSrcBits = In.getScalarValueSizeInBits();
  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

  auto MatchMinMax = [](SDValue V, unsigned Opcode,
                        const APInt &Limit) -> SDValue {
    APInt C;
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
      return V.getOperand(0);
    return SDValue();
  };

  APInt SignedMax, SignedMin;
  if (MatchPackUS) {
    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
    SignedMin = APInt(NumSrcBits, 0);
  } else {
    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
  }

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
      return SMax;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
      return SMin;

  return SDValue();
}

/// Detect a pattern of truncation with signed saturation.
/// The types should allow using the VPMOVSS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
                                       const X86Subtarget &Subtarget,
                                       const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectSSatPattern(In, VT);
}

/// Detect a pattern of truncation with unsigned saturation:
/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow using the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget, const TargetLowering &TLI) { if (!TLI.isTypeLegal(In.getValueType())) return SDValue(); if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget)) return SDValue(); return detectUSatPattern(In, VT, DAG, DL); } static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT SVT = VT.getScalarType(); EVT InVT = In.getValueType(); EVT InSVT = InVT.getScalarType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) && isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) { if (auto SSatVal = detectSSatPattern(In, VT)) return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); } if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && !Subtarget.hasAVX512() && (SVT == MVT::i8 || SVT == MVT::i16) && (InSVT == MVT::i16 || InSVT == MVT::i32)) { if (auto USatVal = detectSSatPattern(In, VT, true)) { // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). if (SVT == MVT::i8 && InSVT == MVT::i32) { EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, DAG, Subtarget); if (Mid) return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, Subtarget); } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, Subtarget); } if (auto SSatVal = detectSSatPattern(In, VT)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, Subtarget); } return SDValue(); } /// This function detects the AVG pattern between vectors of unsigned i8/i16, /// which is c = (a + b + 1) / 2, and replace this operation with the efficient /// X86ISD::AVG instruction. static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (!VT.isVector()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2 && isPowerOf2_32(NumElems))) return SDValue(); // InScalarVT is the intermediate type in AVG pattern and it should be greater // than the original input type (i8/i16). EVT InScalarVT = InVT.getVectorElementType(); if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Detect the following pattern: // // %1 = zext %a to // %2 = zext %b to // %3 = add nuw nsw %1, // %4 = add nuw nsw %3, %2 // %5 = lshr %N, // %6 = trunc %5 to // // In AVX512, the last instruction can also be a trunc store. if (In.getOpcode() != ISD::SRL) return SDValue(); // A lambda checking the given SDValue is a constant vector and each element // is in the range [Min, Max]. auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { BuildVectorSDNode *BV = dyn_cast(V); if (!BV || !BV->isConstant()) return false; for (SDValue Op : V->ops()) { ConstantSDNode *C = dyn_cast(Op); if (!C) return false; const APInt &Val = C->getAPIntValue(); if (Val.ult(Min) || Val.ugt(Max)) return false; } return true; }; // Check if each element of the vector is left-shifted by one. 
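// Worked example (illustrative): for unsigned bytes a = 30 and b = 41 the
// pattern computes (zext(a) + zext(b) + 1) >> 1 = (30 + 41 + 1) >> 1 = 36,
// i.e. the average rounded up, which is exactly what PAVGB/PAVGW produce per
// lane.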
auto LHS = In.getOperand(0); auto RHS = In.getOperand(1); if (!IsConstVectorInRange(RHS, 1, 1)) return SDValue(); if (LHS.getOpcode() != ISD::ADD) return SDValue(); // Detect a pattern of a + b + 1 where the order doesn't matter. SDValue Operands[3]; Operands[0] = LHS.getOperand(0); Operands[1] = LHS.getOperand(1); auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops); }; // Take care of the case when one of the operands is a constant vector whose // element is in the range [1, 256]. if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && Operands[0].getOpcode() == ISD::ZERO_EXTEND && Operands[0].getOperand(0).getValueType() == VT) { // The pattern is detected. Subtract one from the constant vector, then // demote it and emit X86ISD::AVG instruction. SDValue VecOnes = DAG.getConstant(1, DL, InVT); Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); return SplitOpsAndApply(DAG, Subtarget, DL, VT, { Operands[0].getOperand(0), Operands[1] }, AVGBuilder); } if (Operands[0].getOpcode() == ISD::ADD) std::swap(Operands[0], Operands[1]); else if (Operands[1].getOpcode() != ISD::ADD) return SDValue(); Operands[2] = Operands[1].getOperand(0); Operands[1] = Operands[1].getOperand(1); // Now we have three operands of two additions. Check that one of them is a // constant vector with ones, and the other two are promoted from i8/i16. for (int i = 0; i < 3; ++i) { if (!IsConstVectorInRange(Operands[i], 1, 1)) continue; std::swap(Operands[i], Operands[2]); // Check if Operands[0] and Operands[1] are results of type promotion. for (int j = 0; j < 2; ++j) if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || Operands[j].getOperand(0).getValueType() != VT) return SDValue(); // The pattern is detected, emit X86ISD::AVG instruction(s). return SplitOpsAndApply(DAG, Subtarget, DL, VT, { Operands[0].getOperand(0), Operands[1].getOperand(0) }, AVGBuilder); } return SDValue(); } static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { LoadSDNode *Ld = cast(N); EVT RegVT = Ld->getValueType(0); EVT MemVT = Ld->getMemoryVT(); SDLoc dl(Ld); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. Also split non-temporal aligned loads on // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. 
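// Illustrative sketch (assuming a 32-byte v8f32 load on such a target): the
// load is rebuilt below as two v4f32 loads at offsets 0 and 16 from the base
// pointer, joined by a TokenFactor for the chain and a CONCAT_VECTORS for the
// value.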
ISD::LoadExtType Ext = Ld->getExtensionType(); bool Fast; unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && Ext == ISD::NON_EXTLOAD && ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) || (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, AddressSpace, Alignment, &Fast) && !Fast))) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Ptr = Ld->getBasePtr(); EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems/2); SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Alignment, Ld->getMemOperand()->getFlags()); Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl); SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo().getWithOffset(16), MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags()); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); return DCI.CombineTo(N, NewVec, TF, true); } return SDValue(); } /// If V is a build vector of boolean constants and exactly one of those /// constants is true, return the operand index of that true element. /// Otherwise, return -1. static int getOneTrueElt(SDValue V) { // This needs to be a build vector of booleans. // TODO: Checking for the i1 type matches the IR definition for the mask, // but the mask check could be loosened to i8 or other types. That might // also require checking more than 'allOnesValue'; eg, the x86 HW // instructions only require that the MSB is set for each mask element. // The ISD::MSTORE comments/definition do not specify how the mask operand // is formatted. auto *BV = dyn_cast(V); if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) return -1; int TrueIndex = -1; unsigned NumElts = BV->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { const SDValue &Op = BV->getOperand(i); if (Op.isUndef()) continue; auto *ConstNode = dyn_cast(Op); if (!ConstNode) return -1; if (ConstNode->getAPIntValue().isAllOnesValue()) { // If we already found a one, this is too many. if (TrueIndex >= 0) return -1; TrueIndex = i; } } return TrueIndex; } /// Given a masked memory load/store operation, return true if it has one mask /// bit set. If it has one mask bit set, then also return the memory address of /// the scalar element to load/store, the vector index to insert/extract that /// scalar element, and the alignment for the scalar memory access. static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, unsigned &Alignment) { int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); if (TrueMaskElt < 0) return false; // Get the address of the one scalar element that is specified by the mask // using the appropriate offset from the base pointer. EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); Addr = MaskedOp->getBasePtr(); if (TrueMaskElt != 0) { unsigned Offset = TrueMaskElt * EltVT.getStoreSize(); Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp)); } Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize()); return true; } /// If exactly one element of the mask is set for a non-extending masked load, /// it is a scalar load and vector insert. 
/// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Load the one scalar element that is specified by the mask using the // appropriate offset from the base pointer. SDLoc DL(ML); EVT VT = ML->getValueType(0); EVT EltVT = VT.getVectorElementType(); SDValue Load = DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(), Alignment, ML->getMemOperand()->getFlags()); // Insert the loaded element into the appropriate place in the vector. SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getPassThru(), Load, VecIndex); return DCI.CombineTo(ML, Insert, Load.getValue(1), true); } static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); SDLoc DL(ML); EVT VT = ML->getValueType(0); // If we are loading the first and last elements of a vector, it is safe and // always faster to load the whole vector. Replace the masked load with a // vector load and select. unsigned NumElts = VT.getVectorNumElements(); BuildVectorSDNode *MaskBV = cast(ML->getMask()); bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); if (LoadFirstElt && LoadLastElt) { SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMemOperand()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getPassThru()); return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); } // Convert a masked load with a constant mask into a masked load and a select. // This allows the select operation to use a faster kind of select instruction // (for example, vblendvps -> vblendps). // Don't try this if the pass-through operand is already undefined. That would // cause an infinite loop because that's what we're about to create. if (ML->getPassThru().isUndef()) return SDValue(); // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), ML->getMask(), DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); } static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedLoadSDNode *Mld = cast(N); // TODO: Expanding load with constant mask may be optimized as well. if (Mld->isExpandingLoad()) return SDValue(); if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI)) return ScalarLoad; // TODO: Do some AVX512 subsets benefit from this transform? 
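// Illustrative example: a v4f32 masked load whose constant mask is
// <1, 0, 0, 1> touches both the first and the last element, so the whole
// 16 bytes must be accessible; it is therefore safe to replace the masked
// load with a full vector load followed by a select against the pass-through
// value.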
if (!Subtarget.hasAVX512()) if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) return Blend; } if (Mld->getExtensionType() != ISD::SEXTLOAD) return SDValue(); // Resolve extending loads. EVT VT = Mld->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); EVT LdVT = Mld->getMemoryVT(); SDLoc dl(Mld); assert(LdVT != VT && "Cannot extend to the same type"); unsigned ToSz = VT.getScalarSizeInBits(); unsigned FromSz = LdVT.getScalarSizeInBits(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for extending masked load"); unsigned SizeRatio = ToSz / FromSz; assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), LdVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); // Convert PassThru value. SDValue WidePassThru = DAG.getBitcast(WideVecVT, Mld->getPassThru()); if (!Mld->getPassThru().isUndef()) { SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); WidePassThru = DAG.getVectorShuffle(WideVecVT, dl, WidePassThru, DAG.getUNDEF(WideVecVT), ShuffleVec); } // Prepare the new mask. SDValue NewMask; SDValue Mask = Mld->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. NewMask = DAG.getBitcast(WideVecVT, Mask); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(), Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, /// it is a vector extract and scalar store. /// Note: It is expected that the degenerate cases of an all-zeros or all-ones /// mask have already been optimized in IR, so we don't bother with those here. static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG) { // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. SDValue Addr, VecIndex; unsigned Alignment; if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment)) return SDValue(); // Extract the one scalar element that is actually being stored. 
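// Illustrative example: a v4i32 masked store whose constant mask is
// <0, 0, 1, 0> writes only element 2, so it becomes a plain scalar store of
// (extract_vector_elt %val, 2) at base + 8, with the alignment reduced to
// min(original alignment, 4).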
SDLoc DL(MS); EVT VT = MS->getValue().getValueType(); EVT EltVT = VT.getVectorElementType(); SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, MS->getValue(), VecIndex); // Store that element at the appropriate offset from the base pointer. return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(), Alignment, MS->getMemOperand()->getFlags()); } static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { MaskedStoreSDNode *Mst = cast(N); if (Mst->isCompressingStore()) return SDValue(); EVT VT = Mst->getValue().getValueType(); if (!Mst->isTruncatingStore()) { if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) return ScalarStore; // If the mask value has been legalized to a non-boolean vector, try to // simplify ops leading up to it. We only demand the MSB of each lane. SDValue Mask = Mst->getMask(); if (Mask.getScalarValueSizeInBits() != 1) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); } // TODO: AVX512 targets should also be able to simplify something like the // pattern above, but that pattern will be different. It will either need to // match setcc more generally or match PCMPGTM later (in tablegen?). return SDValue(); } // Resolve truncating stores. unsigned NumElems = VT.getVectorNumElements(); EVT StVT = Mst->getMemoryVT(); SDLoc dl(Mst); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. // In this case we don't need any further transformations. if (TLI.isTruncStoreLegal(VT, StVT)) return SDValue(); // From/To sizes and ElemCount must be pow of two. assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. assert (((NumElems * FromSz) % ToSz) == 0 && "Unexpected ratio for truncating masked store"); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); SDValue NewMask; SDValue Mask = Mst->getMask(); if (Mask.getValueType() == VT) { // Mask and original value have the same type. 
NewMask = DAG.getBitcast(WideVecVT, Mask); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) ShuffleVec[i] = NumElems*SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), ShuffleVec); } else { assert(Mask.getValueType().getVectorElementType() == MVT::i1); unsigned WidenNumElts = NumElems*SizeRatio; unsigned MaskNumElts = VT.getVectorNumElements(); EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); SmallVector Ops(NumConcat, ZeroVal); Ops[0] = Mask; NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), NewMask, StVT, Mst->getMemOperand(), false); } static SDValue combineStore(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { StoreSDNode *St = cast(N); EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); SDLoc dl(St); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Convert a store of vXi1 into a store of iX and a bitcast. if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() && VT.getVectorElementType() == MVT::i1) { EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); StoredVal = DAG.getBitcast(NewVT, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // If this is a store of a scalar_to_vector to v1i1, just use a scalar store. // This will avoid a copy to k-register. if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR && StoredVal.getOperand(0).getValueType() == MVT::i8) { return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0), St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // Widen v2i1/v4i1 stores to v8i1. if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && Subtarget.hasAVX512()) { unsigned NumConcats = 8 / VT.getVectorNumElements(); SmallVector Ops(NumConcats, DAG.getUNDEF(VT)); Ops[0] = StoredVal; StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // Turn vXi1 stores of constants into a scalar store. if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 || VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { // If its a v64i1 store without 64-bit support, we need two stores. 
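// Illustrative sketch: on a 32-bit target a constant v64i1 value is split
// below into two v32i1 halves, each half is folded to an i32 immediate, and
// the result is written as two i32 stores at offsets 0 and 4 joined by a
// TokenFactor.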
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(0, 32)); Lo = combinevXi1ConstantToInteger(Lo, DAG); SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl, StoredVal->ops().slice(32, 32)); Hi = combinevXi1ConstantToInteger(Hi, DAG); unsigned Alignment = St->getAlignment(); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), Alignment, St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Hi, Ptr1, St->getPointerInfo().getWithOffset(4), MinAlign(Alignment, 4U), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. bool Fast; unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); if (VT.is256BitVector() && StVT == VT && TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl); SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl); SDValue Ptr0 = St->getBasePtr(); SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl); SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(), Alignment, St->getMemOperand()->getFlags()); SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo().getWithOffset(16), MinAlign(Alignment, 16U), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { // Check if we can detect an AVG pattern from the truncation. If yes, // replace the trunc store by a normal store with the result of X86ISD::AVG // instruction. if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl)) return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (SDValue Val = detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget, TLI)) return EmitTruncSStore(true /* Signed saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), DAG, dl, Subtarget, TLI)) return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), dl, Val, St->getBasePtr(), St->getMemoryVT(), St->getMemOperand(), DAG); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getScalarSizeInBits(); unsigned ToSz = StVT.getScalarSizeInBits(); // The truncating store is legal in some cases. For example // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw // are designated for truncate store. 
// In this case we don't need any further transformations. if (TLI.isTruncStoreLegalOrCustom(VT, StVT)) return SDValue(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromSz) % ToSz) return SDValue(); unsigned SizeRatio = FromSz / ToSz; assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue()); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec, DAG.getUNDEF(WideVecVT), ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz) StoreType = Tp; } // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64. if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 && (64 <= NumElems * ToSz)) StoreType = MVT::f64; // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/StoreType.getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff); SmallVector Chains; SDValue Ptr = St->getBasePtr(); // Perform one or more big stores into memory. for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StoreType, ShuffWide, DAG.getIntPtrConstant(i, dl)); SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); } // Turn load->store of MMX types into GPR load/stores. This avoids clobbering // the FP state in cases where an emms may be missing. // A preferable solution to the general problem is to figure out the right // places to insert EMMS. This qualifies as a quick hack. // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. if (VT.getSizeInBits() != 64) return SDValue(); const Function &F = DAG.getMachineFunction().getFunction(); bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); if ((VT.isVector() || (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) && isa(St->getValue()) && !cast(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { LoadSDNode *Ld = cast(St->getValue().getNode()); SmallVector Ops; if (!ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store // into f64 load/store, avoid the transformation if there are multiple // uses of the loaded value. 
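// Illustrative example (32-bit target with SSE2): an i64 value that is loaded
// and immediately stored can travel through an XMM register as a single f64
// load/store pair instead of two 32-bit GPR load/store pairs, provided the
// loaded value has no other users that would still need it in integer
// registers.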
if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) return SDValue(); SDLoc LdDL(Ld); SDLoc StDL(N); // If we are a 64-bit capable x86, lower to a single movq load/store pair. // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { MVT LdVT = (Subtarget.is64Bit() && (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); // Make sure new load is placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), St->getMemOperand()); } // Otherwise, lower to two pairs of 32-bit loads / stores. SDValue LoAddr = Ld->getBasePtr(); SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL); SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, Ld->getPointerInfo().getWithOffset(4), MinAlign(Ld->getAlignment(), 4), Ld->getMemOperand()->getFlags()); // Make sure new loads are placed in same chain order. DAG.makeEquivalentMemoryOrdering(Ld, LoLd); DAG.makeEquivalentMemoryOrdering(Ld, HiLd); LoAddr = St->getBasePtr(); HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); SDValue LoSt = DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } // This is similar to the above case, but here we handle a scalar 64-bit // integer store that is extracted from a vector on a 32-bit target. // If we have SSE2, then we can treat it like a floating-point double // to get past legalization. The execution dependencies fixup pass will // choose the optimal machine instruction for the store if this really is // an integer or v2f32 rather than an f64. if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue OldExtract = St->getOperand(1); SDValue ExtOp0 = OldExtract.getOperand(0); unsigned VecSize = ExtOp0.getValueSizeInBits(); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, BitCast, OldExtract.getOperand(1)); return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); } return SDValue(); } /// Return 'true' if this vector operation is "horizontal" /// and return the operands for the horizontal operation in LHS and RHS. A /// horizontal operation performs the binary operation on successive elements /// of its first operand, then on successive elements of its second operand, /// returning the resulting values in a vector. For example, if /// A = < float a0, float a1, float a2, float a3 > /// and /// B = < float b0, float b1, float b2, float b3 > /// then the result of doing a horizontal operation on A and B is /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. 
/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; // Look for the following pattern: // A = < float a0, float a1, float a2, float a3 > // B = < float b0, float b1, float b2, float b3 > // and // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > // which is A horizontal-op B. // At least one of the operands should be a vector shuffle. if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE && RHS.getOpcode() != ISD::VECTOR_SHUFFLE) return false; MVT VT = LHS.getSimpleValueType(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for horizontal add/sub"); // View LHS in the form // LHS = VECTOR_SHUFFLE A, B, LMask // If LHS is not a shuffle, then pretend it is the identity shuffle: // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> // NOTE: A default initialized SDValue represents an UNDEF of type VT. unsigned NumElts = VT.getVectorNumElements(); SDValue A, B; SmallVector LMask(NumElts); if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (!LHS.getOperand(0).isUndef()) A = LHS.getOperand(0); if (!LHS.getOperand(1).isUndef()) B = LHS.getOperand(1); ArrayRef Mask = cast(LHS.getNode())->getMask(); llvm::copy(Mask, LMask.begin()); } else { A = LHS; for (unsigned i = 0; i != NumElts; ++i) LMask[i] = i; } // Likewise, view RHS in the form // RHS = VECTOR_SHUFFLE C, D, RMask SDValue C, D; SmallVector RMask(NumElts); if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) { if (!RHS.getOperand(0).isUndef()) C = RHS.getOperand(0); if (!RHS.getOperand(1).isUndef()) D = RHS.getOperand(1); ArrayRef Mask = cast(RHS.getNode())->getMask(); llvm::copy(Mask, RMask.begin()); } else { C = RHS; for (unsigned i = 0; i != NumElts; ++i) RMask[i] = i; } // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. if (A != C) { std::swap(C, D); ShuffleVectorSDNode::commuteMask(RMask); } // Check that the shuffles are both shuffling the same vectors. if (!(A == C && B == D)) return false; // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask // Check that the masks correspond to performing a horizontal operation. // AVX defines horizontal add/sub to operate independently on 128-bit lanes, // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) { // Ignore undefined components. int LIdx = LMask[i + j], RIdx = RMask[i + j]; if (LIdx < 0 || RIdx < 0 || (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. 
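// Worked example (illustrative, v4f32):
//   LHS = shuffle A, B, <0, 2, 4, 6> and RHS = shuffle A, B, <1, 3, 5, 7>
// passes the check below: element 0 expects indices (0, 1), element 1 expects
// (2, 3), element 2 expects (4, 5) and element 3 expects (6, 7), so
// LHS op RHS = <a0 op a1, a2 op a3, b0 op b1, b2 op b3>, i.e. a HADD/HSUB.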
unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; // Check that successive elements are being operated on. If not, this is // not a horizontal operation. int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; if (!(LIdx == Index && RIdx == Index + 1) && !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) return false; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. return true; } /// Do target-specific dag combines on floating-point adds/subs. static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); bool IsFadd = N->getOpcode() == ISD::FADD; auto HorizOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB; assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && isHorizontalBinOp(LHS, RHS, IsFadd) && shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget)) return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); return SDValue(); } /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify /// the codegen. /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove /// anything that is guaranteed to be transformed by DAGCombiner. static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); SDValue Src = N->getOperand(0); unsigned Opcode = Src.getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); EVT SrcVT = Src.getValueType(); auto IsFreeTruncation = [VT](SDValue Op) { unsigned TruncSizeInBits = VT.getScalarSizeInBits(); // See if this has been extended from a smaller/equal size to // the truncation size, allowing a truncation to combine with the extend. unsigned Opcode = Op.getOpcode(); if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND) && Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) return true; // See if this is a single use constant which can be constant folded. - SDValue BC = peekThroughOneUseBitcasts(Op); - return ISD::isBuildVectorOfConstantSDNodes(BC.getNode()); + // NOTE: We don't peek throught bitcasts here because there is currently + // no support for constant folding truncate+bitcast+vector_of_constants. So + // we'll just send up with a truncate on both operands which will + // get turned back into (truncate (binop)) causing an infinite loop. + return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); }; auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1); }; // Don't combine if the operation has other uses. if (!Src.hasOneUse()) return SDValue(); // Only support vector truncation for now. // TODO: i64 scalar math would benefit as well. if (!VT.isVector()) return SDValue(); // In most cases its only worth pre-truncating if we're only facing the cost // of one truncation. // i.e. 
if one of the inputs will constant fold or the input is repeated. switch (Opcode) { case ISD::AND: case ISD::XOR: case ISD::OR: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegalOrPromote(Opcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } case ISD::MUL: // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its // better to truncate if we have the chance. if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) && !TLI.isOperationLegal(Opcode, SrcVT)) return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) return TruncateArithmetic(Op0, Op1); break; } case ISD::SUB: { // TODO: ISD::SUB We are conservative and require both sides to be freely // truncatable to avoid interfering with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1)))) return TruncateArithmetic(Op0, Op1); break; } } return SDValue(); } /// Truncate using ISD::AND mask and X86ISD::PACKUS. static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); EVT InSVT = InVT.getVectorElementType(); EVT OutVT = N->getValueType(0); EVT OutSVT = OutVT.getVectorElementType(); // Split a long vector into vectors of legal type and mask to unset all bits // that won't appear in the result to prevent saturation. // TODO - we should be doing this at the maximum legal size but this is // causing regressions where we're concatenating back to max width just to // perform the AND and then extracting back again..... unsigned NumSubRegs = InVT.getSizeInBits() / 128; unsigned NumSubRegElts = 128 / InSVT.getSizeInBits(); EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts); SmallVector SubVecs(NumSubRegs); APInt Mask = APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits()); SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT); for (unsigned i = 0; i < NumSubRegs; i++) { SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In, DAG.getIntPtrConstant(i * NumSubRegElts, DL)); SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal); } In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs); return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget); } /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue In = N->getOperand(0); EVT InVT = In.getValueType(); EVT OutVT = N->getValueType(0); In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In, DAG.getValueType(OutVT)); return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget); } /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type /// legalization the truncation will be translated into a BUILD_VECTOR with each /// element that is extracted from a vector and then truncated, and it is /// difficult to do this optimization based on them. 
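/// For example (illustrative): with plain SSE2 a v8i32 -> v8i16 truncation is
/// rewritten as a SIGN_EXTEND_INREG of the input followed by PACKSSDW on its
/// two v4i32 halves, instead of the per-element extract/truncate sequence that
/// type legalization would otherwise produce.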
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OutVT = N->getValueType(0); if (!OutVT.isVector()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); EVT InVT = In.getValueType(); unsigned NumElems = OutVT.getVectorNumElements(); // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on // SSE2, and we need to take care of it specially. // AVX512 provides vpmovdb. if (!Subtarget.hasSSE2() || Subtarget.hasAVX2()) return SDValue(); EVT OutSVT = OutVT.getVectorElementType(); EVT InSVT = InVT.getVectorElementType(); if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && NumElems >= 8)) return SDValue(); // SSSE3's pshufb results in less instructions in the cases below. if (Subtarget.hasSSSE3() && NumElems == 8 && ((OutSVT == MVT::i8 && InSVT != MVT::i64) || (InSVT == MVT::i32 && OutSVT == MVT::i16))) return SDValue(); SDLoc DL(N); // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to // truncate 2 x v4i32 to v8i16. if (Subtarget.hasSSE41() || OutSVT == MVT::i8) return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG); if (InSVT == MVT::i32) return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG); return SDValue(); } /// This function transforms vector truncation of 'extended sign-bits' or /// 'extended zero-bits' values. /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) return SDValue(); SDValue In = N->getOperand(0); if (!In.getValueType().isSimple()) return SDValue(); MVT VT = N->getValueType(0).getSimpleVT(); MVT SVT = VT.getScalarType(); MVT InVT = In.getValueType().getSimpleVT(); MVT InSVT = InVT.getScalarType(); // Check we have a truncation suited for PACKSS/PACKUS. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) return SDValue(); if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); unsigned NumPackedSignBits = std::min(SVT.getSizeInBits(), 16); unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; // Use PACKUS if the input has zero-bits that extend all the way to the // packed/truncated value. e.g. masks, zext_in_reg, etc. KnownBits Known = DAG.computeKnownBits(In); unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits)) return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); // Use PACKSS if the input has sign-bits that extend all the way to the // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. unsigned NumSignBits = DAG.ComputeNumSignBits(In); if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits)) return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); return SDValue(); } // Try to form a MULHU or MULHS node by looking for // (trunc (srl (mul ext, ext), 16)) // TODO: This is X86 specific because we want to be able to handle wide types // before type legalization. 
But we can only do it if the vector will be // legalized via widening/splitting. Type legalization can't handle promotion // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG // combiner. static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First instruction should be a right shift of a multiply. if (Src.getOpcode() != ISD::SRL || Src.getOperand(0).getOpcode() != ISD::MUL) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); // Only handle vXi16 types that are at least 128-bits unless they will be // widened. if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 || (!ExperimentalVectorWideningLegalization && VT.getVectorNumElements() < 8)) return SDValue(); // Input type should be vXi32. EVT InVT = Src.getValueType(); if (InVT.getVectorElementType() != MVT::i32) return SDValue(); // Need a shift by 16. APInt ShiftAmt; if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || ShiftAmt != 16) return SDValue(); SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); unsigned ExtOpc = LHS.getOpcode(); if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || RHS.getOpcode() != ExtOpc) return SDValue(); // Peek through the extends. LHS = LHS.getOperand(0); RHS = RHS.getOperand(0); // Ensure the input types match. if (LHS.getValueType() != VT || RHS.getValueType() != VT) return SDValue(); unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes // from one vector with signed bytes from another vector, adds together // adjacent pairs of 16-bit products, and saturates the result before // truncating to 16-bits. // // Which looks something like this: // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL) { if (!VT.isVector() || !Subtarget.hasSSSE3()) return SDValue(); unsigned NumElems = VT.getVectorNumElements(); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems)) return SDValue(); SDValue SSatVal = detectSSatPattern(In, VT); if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) return SDValue(); // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs // of multiplies from even/odd elements. SDValue N0 = SSatVal.getOperand(0); SDValue N1 = SSatVal.getOperand(1); if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); // TODO: Handle constant vectors and use knownbits/computenumsignbits? // Canonicalize zero_extend to LHS. if (N01.getOpcode() == ISD::ZERO_EXTEND) std::swap(N00, N01); if (N11.getOpcode() == ISD::ZERO_EXTEND) std::swap(N10, N11); // Ensure we have a zero_extend and a sign_extend. if (N00.getOpcode() != ISD::ZERO_EXTEND || N01.getOpcode() != ISD::SIGN_EXTEND || N10.getOpcode() != ISD::ZERO_EXTEND || N11.getOpcode() != ISD::SIGN_EXTEND) return SDValue(); // Peek through the extends. N00 = N00.getOperand(0); N01 = N01.getOperand(0); N10 = N10.getOperand(0); N11 = N11.getOperand(0); // Ensure the extend is from vXi8. 
if (N00.getValueType().getVectorElementType() != MVT::i8 || N01.getValueType().getVectorElementType() != MVT::i8 || N10.getValueType().getVectorElementType() != MVT::i8 || N11.getValueType().getVectorElementType() != MVT::i8) return SDValue(); // All inputs should be build_vectors. if (N00.getOpcode() != ISD::BUILD_VECTOR || N01.getOpcode() != ISD::BUILD_VECTOR || N10.getOpcode() != ISD::BUILD_VECTOR || N11.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // N00/N10 are zero extended. N01/N11 are sign extended. // For each element, we need to ensure we have an odd element from one vector // multiplied by the odd element of another vector and the even element from // one of the same vectors being multiplied by the even element from the // other vector. So we need to make sure for each element i, this operator // is being performed: // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] SDValue ZExtIn, SExtIn; for (unsigned i = 0; i != NumElems; ++i) { SDValue N00Elt = N00.getOperand(i); SDValue N01Elt = N01.getOperand(i); SDValue N10Elt = N10.getOperand(i); SDValue N11Elt = N11.getOperand(i); // TODO: Be more tolerant to undefs. if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *ConstN00Elt = dyn_cast(N00Elt.getOperand(1)); auto *ConstN01Elt = dyn_cast(N01Elt.getOperand(1)); auto *ConstN10Elt = dyn_cast(N10Elt.getOperand(1)); auto *ConstN11Elt = dyn_cast(N11Elt.getOperand(1)); if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) return SDValue(); unsigned IdxN00 = ConstN00Elt->getZExtValue(); unsigned IdxN01 = ConstN01Elt->getZExtValue(); unsigned IdxN10 = ConstN10Elt->getZExtValue(); unsigned IdxN11 = ConstN11Elt->getZExtValue(); // Add is commutative so indices can be reordered. if (IdxN00 > IdxN10) { std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } // N0 indices be the even element. N1 indices must be the next odd element. if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); SDValue N10In = N10Elt.getOperand(0); SDValue N11In = N11Elt.getOperand(0); // First time we find an input capture it. if (!ZExtIn) { ZExtIn = N00In; SExtIn = N01In; } if (ZExtIn != N00In || SExtIn != N01In || ZExtIn != N10In || SExtIn != N11In) return SDValue(); } auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); assert(InVT.getScalarType() == MVT::i8 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, PMADDBuilder); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); SDLoc DL(N); // Attempt to pre-truncate inputs to arithmetic ops instead. if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) return V; // Try to detect AVG pattern first. 
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) return Avg; // Try to detect PMADD if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) return PMAdd; // Try to combine truncation with signed/unsigned saturation. if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) return Val; // Try to combine PMULHUW/PMULHW for vXi16. if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) return V; // The bitcast source is a direct mmx result. // Detect bitcasts between i32 to x86mmx if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { SDValue BCSrc = Src.getOperand(0); if (BCSrc.getValueType() == MVT::x86mmx) return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } // Try to truncate extended sign/zero bits with PACKSS/PACKUS. if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; return combineVectorTruncation(N, DAG, Subtarget); } /// Returns the negated value if the node \p N flips sign of FP value. /// /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) /// or FSUB(0, x) /// AVX512F does not have FXOR, so FNEG is lowered as /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). /// In this case we go though all bitcasts. /// This also recognizes splat of a negated value and returns the splat of that /// value. static SDValue isFNEG(SelectionDAG &DAG, SDNode *N) { if (N->getOpcode() == ISD::FNEG) return N->getOperand(0); SDValue Op = peekThroughBitcasts(SDValue(N, 0)); auto VT = Op->getValueType(0); if (auto SVOp = dyn_cast(Op.getNode())) { // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. if (!SVOp->getOperand(1).isUndef()) return SDValue(); if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode())) return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT), SVOp->getMask()); return SDValue(); } unsigned Opc = Op.getOpcode(); if (Opc == ISD::INSERT_VECTOR_ELT) { // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF, // -V, INDEX). SDValue InsVector = Op.getOperand(0); SDValue InsVal = Op.getOperand(1); if (!InsVector.isUndef()) return SDValue(); if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode())) return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, NegInsVal, Op.getOperand(2)); return SDValue(); } if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB) return SDValue(); SDValue Op1 = peekThroughBitcasts(Op.getOperand(1)); if (!Op1.getValueType().isFloatingPoint()) return SDValue(); SDValue Op0 = peekThroughBitcasts(Op.getOperand(0)); // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit // masks. For FSUB, we have to check if constant bits of Op0 are sign bit // masks and hence we swap the operands. if (Opc == ISD::FSUB) std::swap(Op0, Op1); APInt UndefElts; SmallVector EltBits; // Extract constant bits and see if they are all sign bit masks. Ignore the // undef elements. if (getTargetConstantBitsFromNode(Op1, Op1.getScalarValueSizeInBits(), UndefElts, EltBits, /* AllowWholeUndefs */ true, /* AllowPartialUndefs */ false)) { for (unsigned I = 0, E = EltBits.size(); I < E; I++) if (!UndefElts[I] && !EltBits[I].isSignMask()) return SDValue(); return peekThroughBitcasts(Op0); } return SDValue(); } /// Do target-specific dag combines on floating point negations. 
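/// For example (illustrative): with FMA available, (fneg (fma x, y, z)) is
/// rewritten below as (fnmsub x, y, z), folding the sign flip into the fused
/// multiply-add instead of emitting a separate negation.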
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { EVT OrigVT = N->getValueType(0); SDValue Arg = isFNEG(DAG, N); if (!Arg) return SDValue(); EVT VT = Arg.getValueType(); EVT SVT = VT.getScalarType(); SDLoc DL(N); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); // If we're negating a FMUL node on a target with FMA, then we can avoid the // use of a constant by performing (-0 - A*B) instead. // FIXME: Check rounding control flags as well once it becomes available. if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { SDValue Zero = DAG.getConstantFP(0.0, DL, VT); SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), Arg.getOperand(1), Zero); return DAG.getBitcast(OrigVT, NewNode); } // If we're negating an FMA node, then we can adjust the // instruction to include the extra negation. unsigned NewOpcode = 0; if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) { switch (Arg.getOpcode()) { case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break; // We can't handle scalar intrinsic node here because it would only // invert one element and not the whole vector. But we could try to handle // a negation of the lower element only. } } if (NewOpcode) return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg.getNode()->ops())); return SDValue(); } static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); // If we have integer vector types available, use the integer opcodes. if (!VT.isVector() || !Subtarget.hasSSE2()) return SDValue(); SDLoc dl(N); unsigned IntBits = VT.getScalarSizeInBits(); MVT IntSVT = MVT::getIntegerVT(IntBits); MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); unsigned IntOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected FP logic op"); case X86ISD::FOR: IntOpcode = ISD::OR; break; case X86ISD::FXOR: IntOpcode = ISD::XOR; break; case X86ISD::FAND: IntOpcode = ISD::AND; break; case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; } SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); return DAG.getBitcast(VT, IntOp); } /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() != ISD::XOR) return SDValue(); SDValue LHS = N->getOperand(0); auto *RHSC = dyn_cast(N->getOperand(1)); if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC) return SDValue(); X86::CondCode NewCC = X86::GetOppositeBranchCondition( X86::CondCode(LHS->getConstantOperandVal(0))); SDLoc DL(N); return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // If this is SSE1 only convert to FXOR to avoid scalarization. 
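A minimal standalone sketch of the negation identity combineFneg relies on: -(a*b + c) equals -(a*b) - c, which is the FNMSUB form, so the FNEG can be absorbed into the FMA opcode itself (values below are exactly representable, so the equality is exact).

#include <cassert>
#include <cmath>

int main() {
  double A = 3.0, B = -7.5, C = 2.25;
  double Fma    = std::fma(A, B, C);   // FMA:    a*b + c
  double Fnmsub = -(A * B) - C;        // FNMSUB: -(a*b) - c
  assert(-Fma == Fnmsub);              // negating the FMA flips it to FNMSUB
  return 0;
}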
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && N->getValueType(0) == MVT::v4i32) { return DAG.getBitcast( MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, DAG.getBitcast(MVT::v4f32, N->getOperand(0)), DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); } if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue SetCC = foldXor1SetCC(N, DAG)) return SetCC; if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; return combineFneg(N, DAG, Subtarget); } static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned NumBits = VT.getSizeInBits(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // TODO - Constant Folding. if (auto *Cst1 = dyn_cast(Op1)) { // Reduce Cst1 to the bottom 16-bits. // NOTE: SimplifyDemandedBits won't do this for constants. const APInt &Val1 = Cst1->getAPIntValue(); APInt MaskedVal1 = Val1 & 0xFFFF; if (MaskedVal1 != Val1) return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0, DAG.getConstant(MaskedVal1, SDLoc(N), VT)); } // Only bottom 16-bits of the control bits are required. APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16)); if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI)) return SDValue(N, 0); return SDValue(); } static bool isNullFPScalarOrVectorConst(SDValue V) { return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); } /// If a value is a scalar FP zero or a vector FP zero (potentially including /// undefined elements), return a zero constant that may be used to fold away /// that value. In the case of a vector, the returned constant will not contain /// undefined elements even if the input parameter does. This makes it suitable /// to be used as a replacement operand with operations (eg, bitwise-and) where /// an undef should not propagate. static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!isNullFPScalarOrVectorConst(V)) return SDValue(); if (V.getValueType().isVector()) return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); return V; } static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || (VT == MVT::f64 && Subtarget.hasSSE2()) || (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) return SDValue(); auto isAllOnesConstantFP = [](SDValue V) { if (V.getSimpleValueType().isVector()) return ISD::isBuildVectorAllOnes(V.getNode()); auto *C = dyn_cast(V); return C && C->getConstantFPValue()->isAllOnesValue(); }; // fand (fxor X, -1), Y --> fandn X, Y if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); // fand X, (fxor Y, -1) --> fandn Y, X if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); return SDValue(); } /// Do target-specific dag combines on X86ISD::FAND nodes. 
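A minimal standalone sketch of the foldXor1SetCC identity above: a SETCC result is 0 or 1, so XOR-ing it with 1 is the same as evaluating the opposite condition.

#include <cassert>

int main() {
  for (int X = -2; X <= 2; ++X)
    for (int Y = -2; Y <= 2; ++Y) {
      int SetLT = (X < Y) ? 1 : 0;
      // xor(setcc lt, 1) == setcc ge, the opposite branch condition.
      assert((SetLT ^ 1) == ((X >= Y) ? 1 : 0));
    }
  return 0;
}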
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FAND(0.0, x) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) return V; // FAND(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes. static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // FANDN(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // FANDN(x, 0.0) -> 0.0 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) return V; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x if (isNullFPScalarOrVectorConst(N->getOperand(0))) return N->getOperand(1); // F[X]OR(x, 0.0) -> x if (isNullFPScalarOrVectorConst(N->getOperand(1))) return N->getOperand(0); if (SDValue NewVal = combineFneg(N, DAG, Subtarget)) return NewVal; return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); // Only perform optimizations if UnsafeMath is used. if (!DAG.getTarget().Options.UnsafeFPMath) return SDValue(); // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes // into FMINC and FMAXC, which are Commutative operations. unsigned NewOp = 0; switch (N->getOpcode()) { default: llvm_unreachable("unknown opcode"); case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; } return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); } static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Subtarget.useSoftFloat()) return SDValue(); // TODO: If an operand is already known to be a NaN or not a NaN, this // should be an optional swap and FMAX/FMIN. EVT VT = N->getValueType(0); if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDLoc DL(N); auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; // If we don't have to respect NaN inputs, this is a direct translation to x86 // min/max instructions. if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs()) return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); // If we have to respect NaN inputs, this takes at least 3 instructions. // Favor a library call when operating on a scalar and minimizing code size. 
  if (!VT.isVector() &&
      DAG.getMachineFunction().getFunction().optForMinSize())
    return SDValue();

  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
      DAG.getDataLayout(), *DAG.getContext(), VT);

  // There are 4 possibilities involving NaN inputs, and these are the required
  // outputs:
  //                   Op1
  //               Num     NaN
  //            ----------------
  //     Num    |  Max  |  Op0 |
  // Op0        ----------------
  //     NaN    |  Op1  |  NaN |
  //            ----------------
  //
  // The SSE FP max/min instructions were not designed for this case, but rather
  // to implement:
  //   Min = Op1 < Op0 ? Op1 : Op0
  //   Max = Op1 > Op0 ? Op1 : Op0
  //
  // So they always return Op0 if either input is a NaN. However, we can still
  // use those instructions for fmaxnum by selecting away a NaN input.

  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
  // are NaN, the NaN value of Op1 is the result.
  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
}

static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  APInt KnownUndef, KnownZero;
  APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
  if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);

  return SDValue();
}

/// Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);

  // ANDNP(0, x) -> x
  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
    return N->getOperand(1);

  // ANDNP(x, 0) -> 0
  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
    return DAG.getConstant(0, SDLoc(N), VT);

  // Turn ANDNP back to AND if input is inverted.
  if (VT.isVector() && N->getOperand(0).getOpcode() == ISD::XOR &&
      ISD::isBuildVectorAllOnes(N->getOperand(0).getOperand(1).getNode())) {
    return DAG.getNode(ISD::AND, SDLoc(N), VT, N->getOperand(0).getOperand(0),
                       N->getOperand(1));
  }

  // Attempt to recursively combine a bitmask ANDNP with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    if (SDValue Res = combineX86ShufflesRecursively(
            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
            /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget))
      return Res;
  }

  return SDValue();
}

static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // BT ignores high bits in the bit index operand.
  unsigned BitWidth = N1.getValueSizeInBits();
  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
  if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
    return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);

  return SDValue();
}

// Try to combine sext_in_reg of a cmov of constants by extending the constants.
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();

  if (ExtraVT != MVT::i16)
    return SDValue();

  // Look through single use any_extends.
  if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
    N0 = N0.getOperand(0);

  // See if we have a single use cmov.
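A minimal standalone sketch (standard library only) of the fmaxnum lowering in combineFMinNumFMaxNum above: because the SSE-style max returns its second source whenever an input is NaN, one unordered self-compare of Op0 plus a select reproduces fmaxnum's NaN behaviour.

#include <cassert>
#include <cmath>

static double sse_max(double A, double B) { return A > B ? A : B; } // NaN -> B

static double fmaxnum_lowered(double Op0, double Op1) {
  double MinOrMax = sse_max(Op1, Op0);   // note the swapped operand order
  bool IsOp0Nan = std::isnan(Op0);       // setcc Op0, Op0, SETUO
  return IsOp0Nan ? Op1 : MinOrMax;
}

int main() {
  double NaN = std::nan("");
  assert(fmaxnum_lowered(1.0, 2.0) == 2.0);
  assert(fmaxnum_lowered(NaN, 2.0) == 2.0);  // NaN operand is selected away
  assert(fmaxnum_lowered(2.0, NaN) == 2.0);
  assert(std::isnan(fmaxnum_lowered(NaN, NaN)));
  return 0;
}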
if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) return SDValue(); SDValue CMovOp0 = N0.getOperand(0); SDValue CMovOp1 = N0.getOperand(1); // Make sure both operands are constants. if (!isa(CMovOp0.getNode()) || !isa(CMovOp1.getNode())) return SDValue(); SDLoc DL(N); // If we looked through an any_extend above, add one to the constants. if (N0.getValueType() != VT) { CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0); CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1); } CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1); CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1); return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1, N0.getOperand(2), N0.getOperand(3)); } static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (SDValue V = combineSextInRegCmov(N, DAG)) return V; EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT ExtraVT = cast(N1)->getVT(); SDLoc dl(N); // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the // both SSE and AVX2 since there is no sign-extended shift right // operation on a vector with 64-bit elements. //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) -> // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT))) if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND)) { SDValue N00 = N0.getOperand(0); // EXTLOAD has a better solution on AVX2, // it may be replaced with X86ISD::VSEXT node. if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) if (!ISD::isNormalLoad(N00.getNode())) return SDValue(); if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); } } return SDValue(); } /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) /// zext(add_nuw(x, C)) --> add(zext(x), C_zext) /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes /// opportunities to combine math ops, use an LEA, or use a complex addressing /// mode. This can eliminate extend, add, and shift instructions. static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (Ext->getOpcode() != ISD::SIGN_EXTEND && Ext->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); // TODO: This should be valid for other integer types. EVT VT = Ext->getValueType(0); if (VT != MVT::i64) return SDValue(); SDValue Add = Ext->getOperand(0); if (Add.getOpcode() != ISD::ADD) return SDValue(); bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; bool NSW = Add->getFlags().hasNoSignedWrap(); bool NUW = Add->getFlags().hasNoUnsignedWrap(); // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding // into the 'zext' if ((Sext && !NSW) || (!Sext && !NUW)) return SDValue(); // Having a constant operand to the 'add' ensures that we are not increasing // the instruction count because the constant is extended for free below. // A constant operand can also become the displacement field of an LEA. auto *AddOp1 = dyn_cast(Add.getOperand(1)); if (!AddOp1) return SDValue(); // Don't make the 'add' bigger if there's no hope of combining it with some // other 'add' or 'shl' instruction. // TODO: It may be profitable to generate simpler LEA instructions in place // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. 
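A minimal standalone sketch of the identity promoteExtBeforeAdd uses: when the narrow add cannot wrap, the extension can be hoisted above it, i.e. sext(x +nsw C) == sext(x) + sext(C); the counterexample shows why the no-wrap flag is required (assumes a two's-complement host).

#include <cassert>
#include <cstdint>

int main() {
  int8_t X = 100, C = 20;                    // 100 + 20 = 120, no signed wrap
  int32_t Narrow = int32_t(int8_t(X + C));   // sext(add_nsw x, C)
  int32_t Wide   = int32_t(X) + int32_t(C);  // add(sext x, sext C)
  assert(Narrow == Wide);

  // Counterexample when the add wraps: 100 + 100 does not fit in i8, so the
  // narrow result cannot equal 200 and hoisting the extension would be wrong.
  int8_t D = 100;
  assert(int32_t(int8_t(X + D)) != int32_t(X) + int32_t(D));
  return 0;
}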
bool HasLEAPotential = false; for (auto *User : Ext->uses()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; } } if (!HasLEAPotential) return SDValue(); // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); SDValue AddOp0 = Add.getOperand(0); SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); // The wider add is guaranteed to not wrap because both operands are // sign-extended. SDNodeFlags Flags; Flags.setNoSignedWrap(NSW); Flags.setNoUnsignedWrap(NUW); return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); } // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant // operands and the result of CMOV is not used anywhere else - promote CMOV // itself instead of promoting its result. This could be beneficial, because: // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two // (or more) pseudo-CMOVs only when they go one-after-another and // getting rid of result extension code after CMOV will help that. // 2) Promotion of constant CMOV arguments is free, hence the // {ANY,SIGN,ZERO}_EXTEND will just be deleted. // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this // promotion is also good in terms of code-size. // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit // promotion). static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { SDValue CMovN = Extend->getOperand(0); if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse()) return SDValue(); EVT TargetVT = Extend->getValueType(0); unsigned ExtendOpcode = Extend->getOpcode(); SDLoc DL(Extend); EVT VT = CMovN.getValueType(); SDValue CMovOp0 = CMovN.getOperand(0); SDValue CMovOp1 = CMovN.getOperand(1); if (!isa(CMovOp0.getNode()) || !isa(CMovOp1.getNode())) return SDValue(); // Only extend to i32 or i64. if (TargetVT != MVT::i32 && TargetVT != MVT::i64) return SDValue(); // Only extend from i16 unless its a sign_extend from i32. Zext/aext from i32 // are free. if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32)) return SDValue(); // If this a zero extend to i64, we should only extend to i32 and use a free // zero extend to finish. EVT ExtendVT = TargetVT; if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND) ExtendVT = MVT::i32; CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0); CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1); SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1, CMovN.getOperand(2), CMovN.getOperand(3)); // Finish extending if needed. if (ExtendVT != TargetVT) Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res); return Res; } // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). // This is more or less the reverse of combineBitcastvxi1. 
static SDValue combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && Opcode != ISD::ANY_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InSVT = N0.getValueType().getScalarType(); unsigned EltSizeInBits = SVT.getSizeInBits(); // Input type must be extending a bool vector (bit-casted from a scalar // integer) to legal integer types. if (!VT.isVector()) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) return SDValue(); if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) return SDValue(); SDValue N00 = N0.getOperand(0); EVT SclVT = N0.getOperand(0).getValueType(); if (!SclVT.isScalarInteger()) return SDValue(); SDLoc DL(N); SDValue Vec; SmallVector ShuffleMask; unsigned NumElts = VT.getVectorNumElements(); assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); // Broadcast the scalar integer to the vector elements. if (NumElts > EltSizeInBits) { // If the scalar integer is greater than the vector element size, then we // must split it down into sub-sections for broadcasting. For example: // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); unsigned Scale = NumElts / EltSizeInBits; EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); Vec = DAG.getBitcast(VT, Vec); for (unsigned i = 0; i != Scale; ++i) ShuffleMask.append(EltSizeInBits, i); } else { // For smaller scalar integers, we can simply any-extend it to the vector // element size (we don't care about the upper bits) and broadcast it to all // elements. SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); ShuffleMask.append(NumElts, 0); } Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); // Now, mask the relevant bit in each element. SmallVector Bits; for (unsigned i = 0; i != NumElts; ++i) { int BitIdx = (i % EltSizeInBits); APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); Bits.push_back(DAG.getConstant(Bit, DL, SVT)); } SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); // Compare against the bitmask and extend the result. EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); Vec = DAG.getSExtOrTrunc(Vec, DL, VT); // For SEXT, this is now done, otherwise shift the result down for // zero-extension. if (Opcode == ISD::SIGN_EXTEND) return Vec; return DAG.getNode(ISD::SRL, DL, VT, Vec, DAG.getConstant(EltSizeInBits - 1, DL, VT)); } /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating /// with UNDEFs) of the input to vectors of the same size as the target type /// which then extends the lowest elements. 
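A minimal standalone sketch of the net effect of combineToExtendBoolVectorInReg above: element i of the extended vector is bit i of the scalar, obtained by broadcasting the scalar, AND-ing each lane with its bit mask, comparing against the mask, and (for zero extension) shifting right by EltSize - 1.

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int8_t> extendBoolVector(uint8_t Bits, bool Signed) {
  std::vector<int8_t> Out(8);
  for (unsigned I = 0; I != 8; ++I) {
    uint8_t Mask = uint8_t(1u << I);
    // and + setcc-eq against the per-element bit mask ...
    int8_t AllOnesOrZero = ((Bits & Mask) == Mask) ? int8_t(-1) : int8_t(0);
    // ... then an SRL by (EltSize - 1) turns -1 into 1 for the zext case.
    Out[I] = Signed ? AllOnesOrZero : int8_t(uint8_t(AllOnesOrZero) >> 7);
  }
  return Out;
}

int main() {
  std::vector<int8_t> S = extendBoolVector(0x05, /*Signed=*/true);  // bits 0, 2
  assert(S[0] == -1 && S[1] == 0 && S[2] == -1);
  std::vector<int8_t> Z = extendBoolVector(0x05, /*Signed=*/false);
  assert(Z[0] == 1 && Z[1] == 0 && Z[2] == 1);
  return 0;
}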
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (ExperimentalVectorWideningLegalization) return SDValue(); unsigned Opcode = N->getOpcode(); if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND) return SDValue(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (!Subtarget.hasSSE2()) return SDValue(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT SVT = VT.getScalarType(); EVT InVT = N0.getValueType(); EVT InSVT = InVT.getScalarType(); // FIXME: Generic DAGCombiner previously had a bug that would cause a // sign_extend of setcc to sometimes return the original node and tricked it // into thinking CombineTo was used which prevented the target combines from // running. // Earlying out here to avoid regressions like this // (v4i32 (sext (v4i1 (setcc (v4i16))))) // Becomes // (v4i32 (sext_invec (v8i16 (concat (v4i16 (setcc (v4i16))), undef)))) // Type legalized to // (v4i32 (sext_invec (v8i16 (trunc_invec (v4i32 (setcc (v4i32))))))) // Leading to a packssdw+pmovsxwd // We could write a DAG combine to fix this, but really we shouldn't be // creating sext_invec that's forcing v8i16 into the DAG. if (N0.getOpcode() == ISD::SETCC) return SDValue(); // Input type must be a vector and we must be extending legal integer types. if (!VT.isVector() || VT.getVectorNumElements() < 2) return SDValue(); if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) return SDValue(); if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) return SDValue(); // If the input/output types are both legal then we have at least AVX1 and // we will be able to use SIGN_EXTEND/ZERO_EXTEND directly. if (DAG.getTargetLoweringInfo().isTypeLegal(VT) && DAG.getTargetLoweringInfo().isTypeLegal(InVT)) return SDValue(); SDLoc DL(N); auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) { EVT InVT = N.getValueType(); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(), Size / InVT.getScalarSizeInBits()); SmallVector Opnds(Size / InVT.getSizeInBits(), DAG.getUNDEF(InVT)); Opnds[0] = N; return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds); }; // If target-size is less than 128-bits, extend to a type that would extend // to 128 bits, extend that and extract the original target vector. if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) { unsigned Scale = 128 / VT.getSizeInBits(); EVT ExVT = EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits()); SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits()); SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt, DAG.getIntPtrConstant(0, DL)); } // If target-size is 128-bits (or 256-bits on AVX target), then convert to // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT. // Also use this if we don't have SSE41 to allow the legalizer do its job. if (!Subtarget.hasSSE41() || VT.is128BitVector() || (VT.is256BitVector() && Subtarget.hasAVX()) || (VT.is512BitVector() && Subtarget.useAVX512Regs())) { SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits()); Opcode = Opcode == ISD::SIGN_EXTEND ? 
ISD::SIGN_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND_VECTOR_INREG; return DAG.getNode(Opcode, DL, VT, ExOp); } auto SplitAndExtendInReg = [&](unsigned SplitSize) { unsigned NumVecs = VT.getSizeInBits() / SplitSize; unsigned NumSubElts = SplitSize / SVT.getSizeInBits(); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts); EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts); unsigned IROpc = Opcode == ISD::SIGN_EXTEND ? ISD::SIGN_EXTEND_VECTOR_INREG : ISD::ZERO_EXTEND_VECTOR_INREG; SmallVector Opnds; for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) { SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0, DAG.getIntPtrConstant(Offset, DL)); SrcVec = ExtendVecSize(DL, SrcVec, SplitSize); SrcVec = DAG.getNode(IROpc, DL, SubVT, SrcVec); Opnds.push_back(SrcVec); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds); }; // On pre-AVX targets, split into 128-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.hasAVX() && !(VT.getSizeInBits() % 128)) return SplitAndExtendInReg(128); // On pre-AVX512 targets, split into 256-bit nodes of // ISD::*_EXTEND_VECTOR_INREG. if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256)) return SplitAndExtendInReg(256); return SDValue(); } // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm // result type. static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc dl(N); // Only do this combine with AVX512 for vector extends. if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC) return SDValue(); // Only combine legal element types. EVT SVT = VT.getVectorElementType(); if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) return SDValue(); // We can only do this if the vector size in 256 bits or less. unsigned Size = VT.getSizeInBits(); if (Size > 256) return SDValue(); // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since // that's the only integer compares with we have. ISD::CondCode CC = cast(N0->getOperand(2))->get(); if (ISD::isUnsignedIntSetCC(CC)) return SDValue(); // Only do this combine if the extension will be fully consumed by the setcc. EVT N00VT = N0.getOperand(0).getValueType(); EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); if (Size != MatchingVecType.getSizeInBits()) return SDValue(); SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); if (N->getOpcode() == ISD::ZERO_EXTEND) Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType()); return Res; } static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); SDLoc DL(N); if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (!DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR && isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) { // Invert and sign-extend a boolean is the same as zero-extend and subtract // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1. 
// sext (xor Bool, -1) --> sub (zext Bool), 1 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT)); } if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; return SDValue(); } static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) { if (NegMul) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FNMADD; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMADD: Opcode = ISD::FMA; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; } } if (NegAcc) { switch (Opcode) { default: llvm_unreachable("Unexpected opcode"); case ISD::FMA: Opcode = X86ISD::FMSUB; break; case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; case X86ISD::FMSUB: Opcode = ISD::FMA; break; case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; } } return Opcode; } static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); auto invertIfNegative = [&DAG](SDValue &V) { if (SDValue NegVal = isFNEG(DAG, V.getNode())) { V = DAG.getBitcast(V.getValueType(), NegVal); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; } } return false; }; // Do not convert the passthru input of scalar intrinsics. // FIXME: We could allow negations of the lower element only. 
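A minimal standalone sketch of the boolean identity in the comment above: sign-extending an inverted i1 equals zero-extending it and subtracting one (0 -> -1, 1 -> 0), which lowers to a cheap DEC or LEA instead of an extra xor.

#include <cassert>

int main() {
  for (int Bool = 0; Bool <= 1; ++Bool) {
    int SextOfNot  = (Bool ^ 1) ? -1 : 0;   // sext(xor Bool, -1)
    int ZextMinus1 = Bool - 1;              // sub(zext Bool), 1
    assert(SextOfNot == ZextMinus1);
  }
  return 0;
}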
bool NegA = invertIfNegative(A); bool NegB = invertIfNegative(B); bool NegC = invertIfNegative(C); if (!NegA && !NegB && !NegC) return SDValue(); unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, A, B, C); } // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); if (!NegVal) return SDValue(); unsigned NewOpcode; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; } if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), NegVal); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> // (and (i32 x86isd::setcc_carry), 1) // This eliminates the zext. This transformation is necessary because // ISD::SETCC is always legalized to i8. SDLoc dl(N); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() == ISD::AND && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N00.getOperand(0), N00.getOperand(1)), DAG.getConstant(1, dl, VT)); } } if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) return NewCMov; if (DCI.isBeforeLegalizeOps()) if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) return V; if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) return V; if (VT.isVector()) if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) return R; if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) return NewAdd; if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) return R; return SDValue(); } /// Try to map a 128-bit or larger integer comparison to vector instructions /// before type legalization splits it up into chunks. static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); // We're looking for an oversized integer equality comparison. 
SDValue X = SetCC->getOperand(0); SDValue Y = SetCC->getOperand(1); EVT OpVT = X.getValueType(); unsigned OpSize = OpVT.getSizeInBits(); if (!OpVT.isScalarInteger() || OpSize < 128) return SDValue(); // Ignore a comparison with zero because that gets special treatment in // EmitTest(). But make an exception for the special case of a pair of // logically-combined vector-sized operands compared to zero. This pattern may // be generated by the memcmp expansion pass with oversized integer compares // (see PR33325). bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR && X.getOperand(0).getOpcode() == ISD::XOR && X.getOperand(1).getOpcode() == ISD::XOR; if (isNullConstant(Y) && !IsOrXorXorCCZero) return SDValue(); // Bail out if we know that this is not really just an oversized integer. if (peekThroughBitcasts(X).getValueType() == MVT::f128 || peekThroughBitcasts(Y).getValueType() == MVT::f128) return SDValue(); // TODO: Use PXOR + PTEST for SSE4.1 or later? EVT VT = SetCC->getValueType(0); SDLoc DL(SetCC); if ((OpSize == 128 && Subtarget.hasSSE2()) || (OpSize == 256 && Subtarget.hasAVX2()) || (OpSize == 512 && Subtarget.useAVX512Regs())) { EVT VecVT = OpSize == 512 ? MVT::v16i32 : OpSize == 256 ? MVT::v32i8 : MVT::v16i8; EVT CmpVT = OpSize == 512 ? MVT::v16i1 : VecVT; SDValue Cmp; if (IsOrXorXorCCZero) { // This is a bitwise-combined equality comparison of 2 pairs of vectors: // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne // Use 2 vector equality compares and 'and' the results before doing a // MOVMSK. SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0)); SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1)); SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0)); SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1)); SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ); Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2); } else { SDValue VecX = DAG.getBitcast(VecVT, X); SDValue VecY = DAG.getBitcast(VecVT, Y); Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); } // For 512-bits we want to emit a setcc that will lower to kortest. if (OpSize == 512) return DAG.getSetCC(DL, VT, DAG.getBitcast(MVT::i16, Cmp), DAG.getConstant(0xFFFF, DL, MVT::i16), CC); // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 
0xFFFF : 0xFFFFFFFF, DL, MVT::i32); return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); } return SDValue(); } static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { ISD::CondCode CC = cast(N->getOperand(2))->get(); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); EVT VT = N->getValueType(0); EVT OpVT = LHS.getValueType(); SDLoc DL(N); if (CC == ISD::SETNE || CC == ISD::SETEQ) { // 0-x == y --> x+y == 0 // 0-x != y --> x+y != 0 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } // x == 0-y --> x+y == 0 // x != 0-y --> x+y != 0 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1)); return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC); } if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget)) return V; } if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { // Put build_vectors on the right. if (LHS.getOpcode() == ISD::BUILD_VECTOR) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); } bool IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1); bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); if (CC == ISD::SETGT) return DAG.getConstant(0, DL, VT); if (CC == ISD::SETLE) return DAG.getConstant(1, DL, VT); if (CC == ISD::SETEQ || CC == ISD::SETGE) return DAG.getNOT(DL, LHS.getOperand(0), VT); assert((CC == ISD::SETNE || CC == ISD::SETLT) && "Unexpected condition code!"); return LHS.getOperand(0); } } // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just // pre-promote its result type since vXi1 vectors don't get promoted // during type legalization. // NOTE: The element count check is to ignore operand types that need to // go through type promotion to a 128-bit vector. if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && VT.getVectorElementType() == MVT::i1 && (ExperimentalVectorWideningLegalization || VT.getVectorNumElements() > 4) && (OpVT.getVectorElementType() == MVT::i8 || OpVT.getVectorElementType() == MVT::i16)) { SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS, N->getOperand(2)); return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc); } // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early // to avoid scalarization via legalization because v4i32 is not a legal type. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 && LHS.getValueType() == MVT::v4f32) return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); return SDValue(); } static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue Src = N->getOperand(0); MVT SrcVT = Src.getSimpleValueType(); MVT VT = N->getSimpleValueType(0); // Perform constant folding. 
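A minimal standalone sketch (standard library only) of the oversized-equality lowering in combineVectorSizedSetCCEquality above: compare the operands byte-by-byte as PCMPEQB would, gather one bit per byte as PMOVMSKB would, and the values are equal exactly when the 16-bit mask is 0xFFFF.

#include <cassert>
#include <cstdint>
#include <cstring>

static bool equal128(const uint8_t X[16], const uint8_t Y[16]) {
  unsigned MovMsk = 0;
  for (unsigned I = 0; I != 16; ++I)
    MovMsk |= (X[I] == Y[I] ? 1u : 0u) << I;   // pcmpeqb + pmovmskb
  return MovMsk == 0xFFFF;                     // setcc ..., 0xFFFF, eq
}

int main() {
  uint8_t A[16] = {1, 2, 3, 4};
  uint8_t B[16] = {1, 2, 3, 4};
  assert(equal128(A, B) == (std::memcmp(A, B, 16) == 0));
  B[15] = 9;
  assert(equal128(A, B) == (std::memcmp(A, B, 16) == 0));
  return 0;
}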
if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) { assert(VT== MVT::i32 && "Unexpected result type"); APInt Imm(32, 0); for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) { SDValue In = Src.getOperand(Idx); if (!In.isUndef() && cast(In)->getAPIntValue().isNegative()) Imm.setBit(Idx); } return DAG.getConstant(Imm, SDLoc(N), VT); } // Look through int->fp bitcasts that don't change the element width. if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse() && SrcVT.isFloatingPoint() && Src.getOperand(0).getValueType() == EVT(SrcVT).changeVectorElementTypeToInteger()) Src = Src.getOperand(0); // Simplify the inputs. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits())); if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) return SDValue(N, 0); // Combine (movmsk (setne (and X, (1 << C)), 0)) -> (movmsk (X << C)). // Only do this when the setcc input and output types are the same and the // setcc and the 'and' node have a single use. // FIXME: Support 256-bits with AVX1. The movmsk is split, but the and isn't. APInt SplatVal; if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && Src.getOperand(0).getValueType() == Src.getValueType() && cast(Src.getOperand(2))->get() == ISD::SETNE && ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && Src.getOperand(0).getOpcode() == ISD::AND) { SDValue And = Src.getOperand(0); if (And.hasOneUse() && ISD::isConstantSplatVector(And.getOperand(1).getNode(), SplatVal) && SplatVal.isPowerOf2()) { MVT VT = Src.getSimpleValueType(); unsigned BitWidth = VT.getScalarSizeInBits(); unsigned ShAmt = BitWidth - SplatVal.logBase2() - 1; SDLoc DL(And); SDValue X = And.getOperand(0); // If the element type is i8, we need to bitcast to i16 to use a legal // shift. If we wait until lowering we end up with an extra and to bits // from crossing the 8-bit elements, but we don't care about that here. if (VT.getVectorElementType() == MVT::i8) { VT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); X = DAG.getBitcast(VT, X); } SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Cast = DAG.getBitcast(SrcVT, Shl); return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), N->getValueType(0), Cast); } } return SDValue(); } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc DL(N); if (DCI.isBeforeLegalizeOps()) { SDValue Index = N->getOperand(4); // Remove any sign extends from 32 or smaller to larger than 32. // Only do this before LegalizeOps in case we need the sign extend for // legalization. if (Index.getOpcode() == ISD::SIGN_EXTEND) { if (Index.getScalarValueSizeInBits() > 32 && Index.getOperand(0).getScalarValueSizeInBits() <= 32) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) { // The original sign extend has less users, add back to worklist in // case it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); } return SDValue(Res, 0); } } // Make sure the index is either i32 or i64 unsigned ScalarSize = Index.getScalarValueSizeInBits(); if (ScalarSize != 32 && ScalarSize != 64) { MVT EltVT = ScalarSize > 32 ? 
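A minimal standalone sketch of the MOVMSK folds above: MOVMSK only reads each lane's sign bit, so testing a splatted power-of-two bit via and + setne is equivalent to shifting that bit into the sign position and letting MOVMSK read it directly.

#include <cassert>
#include <cstdint>

static unsigned movmsk16(const int16_t Lanes[8]) {
  unsigned Msk = 0;
  for (unsigned I = 0; I != 8; ++I)
    Msk |= (Lanes[I] < 0 ? 1u : 0u) << I;      // collect the sign bits
  return Msk;
}

int main() {
  const unsigned C = 3;                        // the splatted power-of-two bit
  int16_t X[8] = {8, 0, 8, 1, 9, 7, -8, 8};
  int16_t SetNE[8], Shifted[8];
  for (unsigned I = 0; I != 8; ++I) {
    SetNE[I]   = (X[I] & (1 << C)) != 0 ? -1 : 0;        // setne(and X, 1<<C)
    Shifted[I] = int16_t(uint16_t(X[I]) << (15 - C));    // X << (BitWidth-C-1)
  }
  assert(movmsk16(SetNE) == movmsk16(Shifted));
  return 0;
}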
MVT::i64 : MVT::i32; EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT, Index.getValueType().getVectorNumElements()); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index; SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) DCI.AddToWorklist(N); return SDValue(Res, 0); } // Try to remove zero extends from 32->64 if we know the sign bit of // the input is zero. if (Index.getOpcode() == ISD::ZERO_EXTEND && Index.getScalarValueSizeInBits() == 64 && Index.getOperand(0).getScalarValueSizeInBits() == 32) { if (DAG.SignBitIsZero(Index.getOperand(0))) { SmallVector NewOps(N->op_begin(), N->op_end()); NewOps[4] = Index.getOperand(0); SDNode *Res = DAG.UpdateNodeOperands(N, NewOps); if (Res == N) { // The original sign extend has less users, add back to worklist in // case it needs to be removed DCI.AddToWorklist(Index.getNode()); DCI.AddToWorklist(N); } return SDValue(Res, 0); } } } // With AVX2 we only demand the upper bit of the mask. if (!Subtarget.hasAVX512()) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Mask = N->getOperand(2); APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) return SDValue(N, 0); } return SDValue(); } // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); // Try to simplify the EFLAGS and condition code operands. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) return getSETCC(CC, Flags, DL, DAG); return SDValue(); } /// Optimize branch condition evaluation. static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); SDValue EFLAGS = N->getOperand(3); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); // Try to simplify the EFLAGS and condition code operands. // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); } return SDValue(); } static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. 
if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getBitcast(VT, NewAnd); return Res; } return SDValue(); } static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32)) // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. if (DAG.SignBitIsZero(Op0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); return SDValue(); } static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { // First try to optimize away the conversion entirely when it's // conditionally from a constant. Vectors only. if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) return Res; // Now move on to more general possibilities. SDValue Op0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) { SDLoc dl(N); EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } // Without AVX512DQ we only support i64 to float scalar conversion. For both // vectors and scalars, see if we know that the upper bits are all the sign // bit, in which case we can truncate the input to i32 and convert from that. if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { unsigned BitWidth = InVT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); if (NumSignBits >= (BitWidth - 31)) { EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32); if (InVT.isVector()) TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, InVT.getVectorNumElements()); SDLoc dl(N); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); } } // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. 
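A minimal standalone sketch of the SignBitIsZero fold in combineUIntToFP above: when the top bit of the integer is known clear, unsigned and signed integer-to-FP conversions agree, so the cheaper SINT_TO_FP can be used; with the sign bit set they differ, which is why the fold is guarded.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t V = 0x7FFFFFF3u;                    // sign bit is zero
  assert(double(V) == double(int32_t(V)));     // uint_to_fp == sint_to_fp
  uint32_t W = 0x80000001u;                    // sign bit is set
  assert(double(W) != double(int32_t(W)));     // the conversions now differ
  return 0;
}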
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast(Op0.getNode()); EVT LdVT = Ld->getValueType(0); // This transformation is not supported if the result type is f16 or f128. if (VT == MVT::f16 || VT == MVT::f128) return SDValue(); // If we have AVX512DQ we can use packed conversion instructions unless // the VT is f80. if (Subtarget.hasDQI() && VT != MVT::f80) return SDValue(); if (!Ld->isVolatile() && !VT.isVector() && ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() && !Subtarget.is64Bit() && LdVT == MVT::i64) { SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD( SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG); DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1)); return FILDChain; } } return SDValue(); } static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { SDNode *User = *UI; X86::CondCode CC; switch (User->getOpcode()) { default: // Be conservative. return true; case X86ISD::SETCC: case X86ISD::SETCC_CARRY: CC = (X86::CondCode)User->getConstantOperandVal(0); break; case X86ISD::BRCOND: CC = (X86::CondCode)User->getConstantOperandVal(2); break; case X86ISD::CMOV: CC = (X86::CondCode)User->getConstantOperandVal(2); break; } switch (CC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_O: case X86::COND_NO: case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: return true; } } return false; } static bool onlyZeroFlagUsed(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { SDNode *User = *UI; unsigned CCOpNo; switch (User->getOpcode()) { default: // Be conservative. return false; case X86ISD::SETCC: CCOpNo = 0; break; case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::BRCOND: CCOpNo = 2; break; case X86ISD::CMOV: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); if (CC != X86::COND_E && CC != X86::COND_NE) return false; } return true; } static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { // Only handle test patterns. if (!isNullConstant(N->getOperand(1))) return SDValue(); // If we have a CMP of a truncated binop, see if we can make a smaller binop // and use its flags directly. // TODO: Maybe we should try promoting compares that only use the zero flag // first if we can prove the upper bits with computeKnownBits? SDLoc dl(N); SDValue Op = N->getOperand(0); EVT VT = Op.getValueType(); // If we have a constant logical shift that's only used in a comparison // against zero turn it into an equivalent AND. This allows turning it into // a TEST instruction later. if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && Op.hasOneUse() && isa(Op.getOperand(1)) && onlyZeroFlagUsed(SDValue(N, 0))) { EVT VT = Op.getValueType(); unsigned BitWidth = VT.getSizeInBits(); unsigned ShAmt = Op.getConstantOperandVal(1); if (ShAmt < BitWidth) { // Avoid undefined shifts. APInt Mask = Op.getOpcode() == ISD::SRL ? 
APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt) : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt); if (Mask.isSignedIntN(32)) { Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), DAG.getConstant(Mask, dl, VT)); return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); } } } // Look for a truncate with a single use. if (Op.getOpcode() != ISD::TRUNCATE || !Op.hasOneUse()) return SDValue(); Op = Op.getOperand(0); // Arithmetic op can only have one use. if (!Op.hasOneUse()) return SDValue(); unsigned NewOpc; switch (Op.getOpcode()) { default: return SDValue(); case ISD::AND: // Skip and with constant. We have special handling for and with immediate // during isel to generate test instructions. if (isa(Op.getOperand(1))) return SDValue(); NewOpc = X86ISD::AND; break; case ISD::OR: NewOpc = X86ISD::OR; break; case ISD::XOR: NewOpc = X86ISD::XOR; break; case ISD::ADD: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::ADD; break; case ISD::SUB: // If the carry or overflow flag is used, we can't truncate. if (needCarryOrOverflowFlag(SDValue(N, 0))) return SDValue(); NewOpc = X86ISD::SUB; break; } // We found an op we can narrow. Truncate its inputs. SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0)); SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1)); // Use a X86 specific opcode to avoid DAG combine messing with it. SDVTList VTs = DAG.getVTList(VT, MVT::i32); Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1); // For AND, keep a CMP so that we can match the test pattern. if (NewOpc == X86ISD::AND) return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, dl, VT)); // Return the flags. return Op.getValue(1); } static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. if (X86::isZeroNode(N->getOperand(0)) && X86::isZeroNode(N->getOperand(1)) && // We don't have a good way to replace an EFLAGS use, so only do this when // dead right now. SDValue(N, 1).use_empty()) { SDLoc DL(N); EVT VT = N->getValueType(0); SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); SDValue Res1 = DAG.getNode(ISD::AND, DL, VT, DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), N->getOperand(2)), DAG.getConstant(1, DL, VT)); return DCI.CombineTo(N, Res1, CarryOut); } if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) { MVT VT = N->getSimpleValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, N->getOperand(0), N->getOperand(1), Flags); } return SDValue(); } /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. 
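A minimal standalone sketch of the combineCMP shift fold above: when only the zero flag is consumed, comparing a constant shift against zero is equivalent to AND-ing with the corresponding bit-range mask, which can later be matched as a TEST.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned ShAmt = 5;
  const uint32_t SrlMask = ~0u << ShAmt;       // high (32 - ShAmt) bits
  const uint32_t ShlMask = ~0u >> ShAmt;       // low  (32 - ShAmt) bits
  for (uint64_t X = 0; X < (1u << 12); ++X) {
    uint32_t V = uint32_t(X * 0x9E3779B9u);    // arbitrary test values
    assert(((V >> ShAmt) == 0) == ((V & SrlMask) == 0));
    assert(((V << ShAmt) == 0) == ((V & ShlMask) == 0));
  }
  return 0;
}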
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { bool IsSub = N->getOpcode() == ISD::SUB; SDValue X = N->getOperand(0); SDValue Y = N->getOperand(1); // If this is an add, canonicalize a zext operand to the RHS. // TODO: Incomplete? What if both sides are zexts? if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND && Y.getOpcode() != ISD::ZERO_EXTEND) std::swap(X, Y); // Look through a one-use zext. bool PeekedThroughZext = false; if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) { Y = Y.getOperand(0); PeekedThroughZext = true; } // If this is an add, canonicalize a setcc operand to the RHS. // TODO: Incomplete? What if both sides are setcc? // TODO: Should we allow peeking through a zext of the other operand? if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC && Y.getOpcode() != X86ISD::SETCC) std::swap(X, Y); if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. auto *ConstantX = dyn_cast(X); if (ConstantX) { if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { // This is a complicated way to get -1 or 0 from the carry flag: // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Y.getOperand(1)); } if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { SDValue EFLAGS = Y->getOperand(1); if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { // Swap the operands of a SUB, and we have the same pattern as above. // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB SDValue NewSub = DAG.getNode( X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), NewEFLAGS); } } } if (CC == X86::COND_B) { // X + SETB Z --> adc X, 0 // X - SETB Z --> sbb X, 0 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), Y.getOperand(1)); } if (CC == X86::COND_A) { SDValue EFLAGS = Y->getOperand(1); // Try to convert COND_A into COND_B in an attempt to facilitate // materializing "setb reg". // // Do not flip "e > c", where "c" is a constant, because Cmp instruction // cannot take an immediate as its first operand. // if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && EFLAGS.getValueType().isInteger() && !isa(EFLAGS.getOperand(1))) { SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), EFLAGS.getOperand(1), EFLAGS.getOperand(0)); SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); return DAG.getNode(IsSub ? 
X86ISD::SBB : X86ISD::ADC, DL, DAG.getVTList(VT, MVT::i32), X, DAG.getConstant(0, DL, VT), NewEFLAGS); } } if (CC != X86::COND_E && CC != X86::COND_NE) return SDValue(); SDValue Cmp = Y.getOperand(1); if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() || !X86::isZeroNode(Cmp.getOperand(1)) || !Cmp.getOperand(0).getValueType().isInteger()) return SDValue(); SDValue Z = Cmp.getOperand(0); EVT ZVT = Z.getValueType(); // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) { SDValue Zero = DAG.getConstant(0, DL, ZVT); SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), SDValue(Neg.getNode(), 1)); } // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' // with fake operands: // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) || (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) { SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1); } } // (cmp Z, 1) sets the carry flag if Z is 0. SDValue One = DAG.getConstant(1, DL, ZVT); SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One); // Add the flags type for ADC/SBB nodes. SDVTList VTs = DAG.getVTList(VT, MVT::i32); // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) if (CC == X86::COND_NE) return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, DAG.getConstant(-1ULL, DL, VT), Cmp1); // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, DAG.getConstant(0, DL, VT), Cmp1); } static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); // If the vector size is less than 128, or greater than the supported RegSize, // do not use PMADD. 
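  // Sketch of the rewrite performed below: the operands of the reduction's MUL
  // are truncated back to i16 and combined with VPMADDWD, which produces half
  // as many i32 lanes; the result is then widened back to VT by concatenating
  // zeroes before being added into the reduction.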
if (!VT.isVector() || VT.getVectorNumElements() < 8) return SDValue(); if (Op0.getOpcode() != ISD::MUL) std::swap(Op0, Op1); if (Op0.getOpcode() != ISD::MUL) return SDValue(); ShrinkMode Mode; if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); SDLoc DL(N); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, VT.getVectorNumElements() / 2); // Madd vector size is half of the original vector size auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops); }; auto BuildPMADDWD = [&](SDValue Mul) { // Shrink the operands of mul. SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0)); SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1)); SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 }, PMADDWDBuilder); // Fill the rest of the output with 0 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, DAG.getConstant(0, DL, MAddVT)); }; Op0 = BuildPMADDWD(Op0); // It's possible that Op1 is also a mul we can reduce. if (Op1.getOpcode() == ISD::MUL && canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) { Op1 = BuildPMADDWD(Op1); } return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); } static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. if (!VT.isVector() || !VT.isSimple() || !(VT.getVectorElementType() == MVT::i32)) return SDValue(); unsigned RegSize = 128; if (Subtarget.useBWIRegs()) RegSize = 512; else if (Subtarget.hasAVX()) RegSize = 256; // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. // TODO: We should be able to handle larger vectors by splitting them before // feeding them into several SADs, and then reducing over those. if (VT.getSizeInBits() / 4 > RegSize) return SDValue(); // We know N is a reduction add, which means one of its operands is a phi. // To match SAD, we need the other operand to be a vector select. if (Op0.getOpcode() != ISD::VSELECT) std::swap(Op0, Op1); if (Op0.getOpcode() != ISD::VSELECT) return SDValue(); auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) { // SAD pattern detected. Now build a SAD instruction and an addition for // reduction. Note that the number of elements of the result of SAD is less // than the number of elements of its input. Therefore, we could only update // part of elements in the reduction vector. SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget); // The output of PSADBW is a vector of i64. // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero // anyway. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); if (VT.getSizeInBits() >= ResVT.getSizeInBits()) Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); else Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { // Fill the upper elements with zero to match the add width. 
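    // (INSERT_SUBVECTOR into an all-zeros vector of VT keeps the reduction
    // width, while only the low lanes carry the PSADBW sums.)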
SDValue Zero = DAG.getConstant(0, DL, VT); Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, DAG.getIntPtrConstant(0, DL)); } return Sad; }; // Check whether we have an abs-diff pattern feeding into the select. SDValue SadOp0, SadOp1; if (!detectZextAbsDiff(Op0, SadOp0, SadOp1)) return SDValue(); Op0 = BuildPSADBW(SadOp0, SadOp1); // It's possible we have a sad on the other side too. if (Op1.getOpcode() == ISD::VSELECT && detectZextAbsDiff(Op1, SadOp0, SadOp1)) { Op1 = BuildPSADBW(SadOp0, SadOp1); } return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1); } /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> /// The all-ones vector constant can be materialized using a pcmpeq instruction /// that is commonly recognized as an idiom (has no register dependency), so /// that's better/smaller than loading a splat 1 constant. static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Unexpected opcode for increment/decrement transform"); // Pseudo-legality check: getOnesVector() expects one of these types, so bail // out and wait for legalization if we have an unsupported vector length. EVT VT = N->getValueType(0); if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) return SDValue(); APInt SplatVal; if (!isConstantSplat(N->getOperand(1), SplatVal) || !SplatVal.isOneValue()) return SDValue(); SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N)); unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec); } static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { // Example of pattern we try to detect: // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1)))) //(add (build_vector (extract_elt t, 0), // (extract_elt t, 2), // (extract_elt t, 4), // (extract_elt t, 6)), // (build_vector (extract_elt t, 1), // (extract_elt t, 3), // (extract_elt t, 5), // (extract_elt t, 7))) if (!Subtarget.hasSSE2()) return SDValue(); if (Op0.getOpcode() != ISD::BUILD_VECTOR || Op1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Check if one of Op0,Op1 is of the form: // (build_vector (extract_elt Mul, 0), // (extract_elt Mul, 2), // (extract_elt Mul, 4), // ... // the other is of the form: // (build_vector (extract_elt Mul, 1), // (extract_elt Mul, 3), // (extract_elt Mul, 5), // ... // and identify Mul. SDValue Mul; for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) { SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i), Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1); // TODO: Be more tolerant to undefs. 
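    // Each element of both build_vectors must be an extract_vector_elt of the
    // same MUL, with constant, pairwise-consecutive indices; any mismatch in
    // the checks below bails out of the combine.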
if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *Const0L = dyn_cast(Op0L->getOperand(1)); auto *Const1L = dyn_cast(Op1L->getOperand(1)); auto *Const0H = dyn_cast(Op0H->getOperand(1)); auto *Const1H = dyn_cast(Op1H->getOperand(1)); if (!Const0L || !Const1L || !Const0H || !Const1H) return SDValue(); unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(), Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue(); // Commutativity of mul allows factors of a product to reorder. if (Idx0L > Idx1L) std::swap(Idx0L, Idx1L); if (Idx0H > Idx1H) std::swap(Idx0H, Idx1H); // Commutativity of add allows pairs of factors to reorder. if (Idx0L > Idx0H) { std::swap(Idx0L, Idx0H); std::swap(Idx1L, Idx1H); } if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 || Idx1H != 2 * i + 3) return SDValue(); if (!Mul) { // First time an extract_elt's source vector is visited. Must be a MUL // with 2X number of vector elements than the BUILD_VECTOR. // Both extracts must be from same MUL. Mul = Op0L->getOperand(0); if (Mul->getOpcode() != ISD::MUL || Mul.getValueType().getVectorNumElements() != 2 * e) return SDValue(); } // Check that the extract is from the same MUL previously seen. if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) || Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0)) return SDValue(); } // Check if the Mul source can be safely shrunk. ShrinkMode Mode; if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16) return SDValue(); auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); assert(InVT.getScalarType() == MVT::i32 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, InVT.getVectorNumElements()); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]), DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1])); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { Mul.getOperand(0), Mul.getOperand(1) }, PMADDBuilder); } // Try to turn (add (umax X, C), -C) into (psubus X, C) static SDValue combineAddToSUBUS(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); EVT VT = N->getValueType(0); // psubus is available in SSE2 for i8 and i16 vectors. if (!VT.isVector() || VT.getVectorNumElements() < 2 || !isPowerOf2_32(VT.getVectorNumElements()) || !(VT.getVectorElementType() == MVT::i8 || VT.getVectorElementType() == MVT::i16)) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() != ISD::UMAX) return SDValue(); // The add should have a constant that is the negative of the max. // TODO: Handle build_vectors with undef elements. auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) { return Max->getAPIntValue() == (-Op->getAPIntValue()); }; if (!ISD::matchBinaryPredicate(Op0.getOperand(1), Op1, MatchUSUBSAT)) return SDValue(); SDLoc DL(N); return DAG.getNode(ISD::USUBSAT, DL, VT, Op0.getOperand(0), Op0.getOperand(1)); } // Attempt to turn this pattern into PMADDWD. 
// (mul (add (zext (build_vector)), (zext (build_vector))), // (add (zext (build_vector)), (zext (build_vector))) static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget) { if (!Subtarget.hasSSE2()) return SDValue(); if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) return SDValue(); if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || VT.getVectorNumElements() < 4 || !isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); // All inputs need to be sign extends. // TODO: Support ZERO_EXTEND from known positive? if (N00.getOpcode() != ISD::SIGN_EXTEND || N01.getOpcode() != ISD::SIGN_EXTEND || N10.getOpcode() != ISD::SIGN_EXTEND || N11.getOpcode() != ISD::SIGN_EXTEND) return SDValue(); // Peek through the extends. N00 = N00.getOperand(0); N01 = N01.getOperand(0); N10 = N10.getOperand(0); N11 = N11.getOperand(0); // Must be extending from vXi16. EVT InVT = N00.getValueType(); if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT || N10.getValueType() != InVT || N11.getValueType() != InVT) return SDValue(); // All inputs should be build_vectors. if (N00.getOpcode() != ISD::BUILD_VECTOR || N01.getOpcode() != ISD::BUILD_VECTOR || N10.getOpcode() != ISD::BUILD_VECTOR || N11.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // For each element, we need to ensure we have an odd element from one vector // multiplied by the odd element of another vector and the even element from // one of the same vectors being multiplied by the even element from the // other vector. So we need to make sure for each element i, this operator // is being performed: // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] SDValue In0, In1; for (unsigned i = 0; i != N00.getNumOperands(); ++i) { SDValue N00Elt = N00.getOperand(i); SDValue N01Elt = N01.getOperand(i); SDValue N10Elt = N10.getOperand(i); SDValue N11Elt = N11.getOperand(i); // TODO: Be more tolerant to undefs. if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); auto *ConstN00Elt = dyn_cast(N00Elt.getOperand(1)); auto *ConstN01Elt = dyn_cast(N01Elt.getOperand(1)); auto *ConstN10Elt = dyn_cast(N10Elt.getOperand(1)); auto *ConstN11Elt = dyn_cast(N11Elt.getOperand(1)); if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) return SDValue(); unsigned IdxN00 = ConstN00Elt->getZExtValue(); unsigned IdxN01 = ConstN01Elt->getZExtValue(); unsigned IdxN10 = ConstN10Elt->getZExtValue(); unsigned IdxN11 = ConstN11Elt->getZExtValue(); // Add is commutative so indices can be reordered. if (IdxN00 > IdxN10) { std::swap(IdxN00, IdxN10); std::swap(IdxN01, IdxN11); } // N0 indices be the even element. N1 indices must be the next odd element. if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i || IdxN11 != 2 * i + 1) return SDValue(); SDValue N00In = N00Elt.getOperand(0); SDValue N01In = N01Elt.getOperand(0); SDValue N10In = N10Elt.getOperand(0); SDValue N11In = N11Elt.getOperand(0); // First time we find an input capture it. if (!In0) { In0 = N00In; In1 = N01In; } // Mul is commutative so the input vectors can be in any order. // Canonicalize to make the compares easier. 
if (In0 != N00In) std::swap(N00In, N01In); if (In0 != N10In) std::swap(N10In, N11In); if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In) return SDValue(); } auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { // Shrink by adding truncate nodes and let DAGCombine fold with the // sources. EVT InVT = Ops[0].getValueType(); assert(InVT.getScalarType() == MVT::i16 && "Unexpected scalar element type"); assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, InVT.getVectorNumElements() / 2); return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, PMADDBuilder); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { const SDNodeFlags Flags = N->getFlags(); if (Flags.hasVectorReduction()) { if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget)) return Sad; if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget)) return MAdd; } EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget)) return MAdd; if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, SDLoc(N), VT, Subtarget)) return MAdd; // Try to synthesize horizontal adds from adds of shuffles. if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) && shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HADDBuilder); } if (SDValue V = combineIncDecVector(N, DAG)) return V; if (SDValue V = combineAddToSUBUS(N, DAG, Subtarget)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); EVT VT = N->getValueType(0); // PSUBUS is supported, starting from SSE2, but truncation for v8i32 // is only worth it with SSSE3 (PSHUFB). if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) && !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 || VT == MVT::v8i64))) return SDValue(); SDValue SubusLHS, SubusRHS; // Try to find umax(a,b) - b or a - umin(a,b) patterns // they may be converted to subus(a,b). // TODO: Need to add IR canonicalization for this code. if (Op0.getOpcode() == ISD::UMAX) { SubusRHS = Op1; SDValue MaxLHS = Op0.getOperand(0); SDValue MaxRHS = Op0.getOperand(1); if (MaxLHS == Op1) SubusLHS = MaxRHS; else if (MaxRHS == Op1) SubusLHS = MaxLHS; else return SDValue(); } else if (Op1.getOpcode() == ISD::UMIN) { SubusLHS = Op0; SDValue MinLHS = Op1.getOperand(0); SDValue MinRHS = Op1.getOperand(1); if (MinLHS == Op0) SubusRHS = MinRHS; else if (MinRHS == Op0) SubusRHS = MinLHS; else return SDValue(); } else return SDValue(); auto USUBSATBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(ISD::USUBSAT, DL, Ops[0].getValueType(), Ops); }; // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with // special preprocessing in some cases. 
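  // For example (illustrative): a v8i32 usubsat whose LHS is known to be
  // zero extended from 16 bits is narrowed to v8i16 by clamping the RHS with
  // UMIN, truncating both sides, and zero extending the result again.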
  if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
                            { SubusLHS, SubusRHS }, USUBSATBuilder);

  // The special preprocessing case can only be applied if the value was
  // zero extended from 16 bits, so we require the first 16 bits to be zeros
  // for 32 bit values, or the first 48 bits for 64 bit values.
  KnownBits Known = DAG.computeKnownBits(SubusLHS);
  unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zeroextended - truncate SubusRHS to its
  // size SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus =
      SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
                       { NewSubusLHS, NewSubusRHS }, USUBSATBuilder);
  // Zero extend the result, it may be used somewhere as 32 bit,
  // if not zext and following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
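  // e.g. subtracting the odd-index lanes from the even-index lanes of the same
  // shuffled sources can map onto a single X86ISD::HSUB when SSSE3 is
  // available (illustrative; isHorizontalBinOp checks the exact shuffle form).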
EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) && shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); }; return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HSUBBuilder); } if (SDValue V = combineIncDecVector(N, DAG)) return V; // Try to create PSUBUS if SUB's argument is max/min if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) return V; return combineAddOrSubToADCOrSBB(N, DAG); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); SDLoc DL(N); if (N->getOperand(0) == N->getOperand(1)) { if (N->getOpcode() == X86ISD::PCMPEQ) return DAG.getConstant(-1, DL, VT); if (N->getOpcode() == X86ISD::PCMPGT) return DAG.getConstant(0, DL, VT); } return SDValue(); } static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1; SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); unsigned IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { // Inserting zeros into zeros is a nop. if (ISD::isBuildVectorAllZeros(SubVec.getNode())) return getZeroVector(OpVT, Subtarget, DAG, dl); // If we're inserting into a zero vector and then into a larger zero vector, // just insert into the larger zero vector directly. if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { unsigned Idx2Val = SubVec.getConstantOperandVal(2); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec.getOperand(1), DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); } // If we're inserting into a zero vector and our input was extracted from an // insert into a zero vector of the same type and the extraction was at // least as large as the original insertion. Just insert the original // subvector into a zero vector. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && SubVec.getConstantOperandVal(1) == 0 && SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { SDValue Ins = SubVec.getOperand(0); if (Ins.getConstantOperandVal(2) == 0 && ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), Ins.getOperand(1), N->getOperand(2)); } // If we're inserting a bitcast into zeros, rewrite the insert and move the // bitcast to the other side. This helps with detecting zero extending // during isel. // TODO: Is this useful for other indices than 0? 
if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, DAG.getBitcast(NewVT, Vec), SubVec.getOperand(0), N->getOperand(2)); return DAG.getBitcast(OpVT, Insert); } } // Stop here if this is an i1 vector. if (IsI1Vector) return SDValue(); // If this is an insert of an extract, combine to a shuffle. Don't do this // if the insert or extract can be represented with a subregister operation. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && (IdxVal != 0 || !Vec.isUndef())) { int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); int SubVecNumElts = SubVecVT.getVectorNumElements(); SmallVector Mask(VecNumElts); // First create an identity shuffle mask. for (int i = 0; i != VecNumElts; ++i) Mask[i] = i; // Now insert the extracted portion. for (int i = 0; i != SubVecNumElts; ++i) Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); } } // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte // load: // (insert_subvector (insert_subvector undef, (load16 addr), 0), // (load16 addr + 16), Elts/2) // --> load32 addr // or: // (insert_subvector (insert_subvector undef, (load32 addr), 0), // (load32 addr + 32), Elts/2) // --> load64 addr // or a 16-byte or 32-byte broadcast: // (insert_subvector (insert_subvector undef, (load16 addr), 0), // (load16 addr), Elts/2) // --> X86SubVBroadcast(load16 addr) // or: // (insert_subvector (insert_subvector undef, (load32 addr), 0), // (load32 addr), Elts/2) // --> X86SubVBroadcast(load32 addr) if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) { if (isNullConstant(Vec.getOperand(2))) { SDValue SubVec2 = Vec.getOperand(1); // If needed, look through bitcasts to get to the load. if (auto *FirstLd = dyn_cast(peekThroughBitcasts(SubVec2))) { bool Fast; unsigned Alignment = FirstLd->getAlignment(); unsigned AS = FirstLd->getAddressSpace(); const X86TargetLowering *TLI = Subtarget.getTargetLowering(); if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), OpVT, AS, Alignment, &Fast) && Fast) { SDValue Ops[] = {SubVec2, SubVec}; if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, Subtarget, false)) return Ld; } } // If lower/upper loads are the same and there's no other use of the lower // load, then splat the loaded value with a broadcast. if (auto *Ld = dyn_cast(peekThroughOneUseBitcasts(SubVec2))) if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && Vec.hasOneUse()) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); // If this is subv_broadcast insert into both halves, use a larger // subv_broadcast. if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec.getOperand(0)); // If we're inserting all zeros into the upper half, change this to // an insert into an all zeros vector. We will match this to a move // with implicit upper bit zeroing during isel. 
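      // (A VEX-encoded 128-bit move already clears the upper half of the
      // destination register, so no explicit zeroing instruction is needed.)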
if (ISD::isBuildVectorAllZeros(SubVec.getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, Vec.getOperand(2)); // If we are inserting into both halves of the vector, the starting // vector should be undef. If it isn't, make it so. Only do this if the // the early insert has no other uses. // TODO: Should this be a generic DAG combine? if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), SubVec2, Vec.getOperand(2)); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, N->getOperand(2)); } } } return SDValue(); } static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // For AVX1 only, if we are extracting from a 256-bit and+not (which will // eventually get combined/lowered into ANDNP) with a concatenated operand, // split the 'and' into 128-bit ops to avoid the concatenate and extract. // We let generic combining take over from there to simplify the // insert/extract and 'not'. // This pattern emerges during AVX1 legalization. We handle it before lowering // to avoid complications like splitting constant vector loads. // Capture the original wide type in the likely case that we need to bitcast // back to this type. EVT VT = N->getValueType(0); EVT WideVecVT = N->getOperand(0).getValueType(); SDValue WideVec = peekThroughBitcasts(N->getOperand(0)); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && TLI.isTypeLegal(WideVecVT) && WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) { auto isConcatenatedNot = [] (SDValue V) { V = peekThroughBitcasts(V); if (!isBitwiseNot(V)) return false; SDValue NotOp = V->getOperand(0); return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; }; if (isConcatenatedNot(WideVec.getOperand(0)) || isConcatenatedNot(WideVec.getOperand(1))) { // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 SDValue Concat = split256IntArith(WideVec, DAG); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, DAG.getBitcast(WideVecVT, Concat), N->getOperand(1)); } } if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT OpVT = N->getSimpleValueType(0); SDValue InVec = N->getOperand(0); unsigned IdxVal = cast(N->getOperand(1))->getZExtValue(); if (ISD::isBuildVectorAllZeros(InVec.getNode())) return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); if (ISD::isBuildVectorAllOnes(InVec.getNode())) { if (OpVT.getScalarType() == MVT::i1) return DAG.getConstant(1, SDLoc(N), OpVT); return getOnesVector(OpVT, DAG, SDLoc(N)); } if (InVec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getBuildVector( OpVT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { unsigned InOpcode = InVec.getOpcode(); if (OpVT == MVT::v2f64 && InVec.getValueType() == MVT::v4f64) { // v2f64 CVTDQ2PD(v4i32). if (InOpcode == ISD::SINT_TO_FP && InVec.getOperand(0).getValueType() == MVT::v4i32) { return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), OpVT, InVec.getOperand(0)); } // v2f64 CVTPS2PD(v4f32). 
if (InOpcode == ISD::FP_EXTEND && InVec.getOperand(0).getValueType() == MVT::v4f32) { return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), OpVT, InVec.getOperand(0)); } } if ((InOpcode == ISD::ZERO_EXTEND || InOpcode == ISD::SIGN_EXTEND) && OpVT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { unsigned ExtOp = InOpcode == ISD::ZERO_EXTEND ? ISD::ZERO_EXTEND_VECTOR_INREG : ISD::SIGN_EXTEND_VECTOR_INREG; return DAG.getNode(ExtOp, SDLoc(N), OpVT, InVec.getOperand(0)); } if ((InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG || InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) && OpVT.is128BitVector() && InVec.getOperand(0).getSimpleValueType().is128BitVector()) { return DAG.getNode(InOpcode, SDLoc(N), OpVT, InVec.getOperand(0)); } if (InOpcode == ISD::BITCAST) { // TODO - do this for target shuffles in general. SDValue InVecBC = peekThroughOneUseBitcasts(InVec); if (InVecBC.getOpcode() == X86ISD::PSHUFB && OpVT.is128BitVector()) { SDLoc DL(N); SDValue SubPSHUFB = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, extract128BitVector(InVecBC.getOperand(0), 0, DAG, DL), extract128BitVector(InVecBC.getOperand(1), 0, DAG, DL)); return DAG.getBitcast(OpVT, SubPSHUFB); } } } return SDValue(); } static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. // This occurs frequently in our masked scalar intrinsic code and our // floating point select lowering with AVX512. // TODO: SimplifyDemandedBits instead? if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) if (auto *C = dyn_cast(Src.getOperand(1))) if (C->getAPIntValue().isOneValue()) return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Src.getOperand(0)); // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() && Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) if (auto *C = dyn_cast(Src.getOperand(1))) if (C->isNullValue()) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, Src.getOperand(0), Src.getOperand(1)); return SDValue(); } // Simplify PMULDQ and PMULUDQ operations. static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); // Canonicalize constant to RHS. if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) && !DAG.isConstantIntBuildVectorOrConstantInt(RHS)) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); // Multiply by zero. if (ISD::isBuildVectorAllZeros(RHS.getNode())) return RHS; // Aggressively peek through ops to get at the demanded low bits. APInt DemandedMask = APInt::getLowBitsSet(64, 32); SDValue DemandedLHS = DAG.GetDemandedBits(LHS, DemandedMask); SDValue DemandedRHS = DAG.GetDemandedBits(RHS, DemandedMask); if (DemandedLHS || DemandedRHS) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), DemandedLHS ? DemandedLHS : LHS, DemandedRHS ? DemandedRHS : RHS); // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element. 
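  // Let SimplifyDemandedBits strip operand bits that cannot affect the result;
  // if it changes anything, report the node back to the combiner.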
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI)) return SDValue(N, 0); return SDValue(); } SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::SCALAR_TO_VECTOR: return combineScalarToVector(N, DAG); case ISD::EXTRACT_VECTOR_ELT: case X86ISD::PEXTRW: case X86ISD::PEXTRB: return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); case ISD::EXTRACT_SUBVECTOR: return combineExtractSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); case X86ISD::CMP: return combineCMP(N, DAG); case ISD::ADD: return combineAdd(N, DAG, Subtarget); case ISD::SUB: return combineSub(N, DAG, Subtarget); case X86ISD::SBB: return combineSBB(N, DAG); case X86ISD::ADC: return combineADC(N, DAG, DCI); case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); case ISD::SHL: case ISD::SRA: case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget); case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget); case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); case ISD::STORE: return combineStore(N, DAG, Subtarget); case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget); case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget); case ISD::FADD: case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); case ISD::FNEG: return combineFneg(N, DAG, Subtarget); case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return combineFOr(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return combineFMinFMax(N, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); case X86ISD::PACKSS: case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::VSHL: case X86ISD::VSRA: case X86ISD::VSRL: return combineVectorShiftVar(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: return combineVectorShiftImm(N, DAG, DCI, Subtarget); case X86ISD::PINSRB: case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target 
specific shuffles case X86ISD::INSERTPS: case X86ISD::EXTRQI: case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::BLENDI: case X86ISD::UNPCKH: case X86ISD::UNPCKL: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIL2: case X86ISD::VPERMILPI: case X86ISD::VPERMILPV: case X86ISD::VPERM2X128: case X86ISD::SHUF128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); case X86ISD::FMADD_RND: case X86ISD::FMSUB: case X86ISD::FMSUB_RND: case X86ISD::FNMADD: case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: case ISD::FMA: return combineFMA(N, DAG, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); case X86ISD::MGATHER: case X86ISD::MSCATTER: case ISD::MGATHER: case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI); } return SDValue(); } bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { if (!isTypeLegal(VT)) return false; // There are no vXi8 shifts. if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) return false; // 8-bit multiply is probably not much cheaper than 32-bit multiply, and // we have specializations to turn 32-bit multiply into LEA or other ops. // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally // check for a constant operand to the multiply. if (Opc == ISD::MUL && VT == MVT::i8) return false; // i16 instruction encodings are longer and some i16 instructions are slow, // so those are not desirable. if (VT == MVT::i16) { switch (Opc) { default: break; case ISD::LOAD: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: case ISD::SHL: case ISD::SRL: case ISD::SUB: case ISD::ADD: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: return false; } } // Any legal type not explicitly accounted for above here is desirable. return true; } SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr, SelectionDAG &DAG) const { const Module *M = DAG.getMachineFunction().getMMI().getModule(); Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); if (IsCFProtectionSupported) { // In case control-flow branch protection is enabled, we need to add // notrack prefix to the indirect branch. // In order to do that we create NT_BRIND SDNode. // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix. return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr); } return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG); } bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { EVT VT = Op.getValueType(); bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL && isa(Op.getOperand(1)); // i16 is legal, but undesirable since i16 instruction encodings are longer // and some i16 instructions are slow. 
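  // (Most i16 forms carry a 0x66 operand-size prefix, which is what makes
  // their encodings longer than the equivalent i32 forms.)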
  // 8-bit multiply-by-constant can usually be expanded to something cheaper
  // using LEA and/or other ALU ops.
  if (VT != MVT::i16 && !Is8BitMulByConstant)
    return false;

  auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (!ISD::isNormalStore(User))
      return false;
    auto *Ld = cast<LoadSDNode>(Load);
    auto *St = cast<StoreSDNode>(User);
    return Ld->getBasePtr() == St->getBasePtr();
  };

  auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
    if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
      return false;
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (User->getOpcode() != ISD::ATOMIC_STORE)
      return false;
    auto *Ld = cast<AtomicSDNode>(Load);
    auto *St = cast<AtomicSDNode>(User);
    return Ld->getBasePtr() == St->getBasePtr();
  };

  bool Commute = false;
  switch (Op.getOpcode()) {
  default: return false;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
      return false;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N1) &&
        (!Commute || !isa<ConstantSDNode>(N0) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
      return false;
    if (MayFoldLoad(N0) &&
        ((Commute && !isa<ConstantSDNode>(N1)) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
      return false;
    if (IsFoldableAtomicRMW(N0, Op) ||
        (Commute && IsFoldableAtomicRMW(N1, Op)))
      return false;
  }
  }

  PVT = MVT::i32;
  return true;
}

bool X86TargetLowering::
    isDesirableToCombineBuildVectorToShuffleTruncate(
        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
         "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
      "Shuffle Mask expected to be legal");

  // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}

//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
return false; S = S.substr(Pos); } return S.empty(); } static bool clobbersFlagRegisters(const SmallVector &AsmPieces) { if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") && std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) { if (AsmPieces.size() == 3) return true; else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}")) return true; } } return false; } bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { InlineAsm *IA = cast(CI->getCalledValue()); const std::string &AsmStr = IA->getAsmString(); IntegerType *Ty = dyn_cast(CI->getType()); if (!Ty || Ty->getBitWidth() % 16 != 0) return false; // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: // FIXME: this should verify that we are targeting a 486 or better. If not, // we will turn this bswap into something that will be lowered to logical // ops instead of emitting the bswap asm. For now, we don't support 486 or // lower so don't worry about this. // bswap $0 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || matchAsm(AsmPieces[0], {"bswapl", "$0"}) || matchAsm(AsmPieces[0], {"bswapq", "$0"}) || matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { // No need to check constraints, nothing other than the equivalent of // "=r,0" would be valid here. return IntrinsicLowering::LowerToByteSwap(CI); } // rorw $$8, ${0:w} --> llvm.bswap.i16 if (CI->getType()->isIntegerTy(16) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } break; case 3: if (CI->getType()->isIntegerTy(32) && IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { AsmPieces.clear(); StringRef ConstraintsStr = IA->getConstraintString(); SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); array_pod_sort(AsmPieces.begin(), AsmPieces.end()); if (clobbersFlagRegisters(AsmPieces)) return IntrinsicLowering::LowerToByteSwap(CI); } if (CI->getType()->isIntegerTy(64)) { InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); if (Constraints.size() >= 2 && Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && matchAsm(AsmPieces[1], {"bswap", "%edx"}) && matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) return IntrinsicLowering::LowerToByteSwap(CI); } } break; } return false; } /// Given a constraint letter, return the type of constraint for this target. 
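/// For example, 'a' names a single fixed register (C_Register), 'x' names the
/// SSE register class (C_RegisterClass), and 'I' is an immediate-range
/// constraint (C_Other); see the switch below.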
X86TargetLowering::ConstraintType X86TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'R': case 'q': case 'Q': case 'f': case 't': case 'u': case 'y': case 'x': case 'v': case 'Y': case 'l': case 'k': // AVX512 masking registers. return C_RegisterClass; case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': return C_Register; case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'G': case 'C': case 'e': case 'Z': return C_Other; default: break; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; case 'Y': switch (Constraint[1]) { default: break; case 'z': case '0': return C_Register; case 'i': case 'm': case 'k': case 't': case '2': return C_RegisterClass; } } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight X86TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); LLVM_FALLTHROUGH; case 'R': case 'q': case 'Q': case 'a': case 'b': case 'c': case 'd': case 'S': case 'D': case 'A': if (CallOperandVal->getType()->isIntegerTy()) weight = CW_SpecificReg; break; case 'f': case 't': case 'u': if (type->isFloatingPointTy()) weight = CW_SpecificReg; break; case 'y': if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; case 'Y': { unsigned Size = StringRef(constraint).size(); // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' char NextChar = Size == 2 ? constraint[1] : 'i'; if (Size > 2) break; switch (NextChar) { default: return CW_Invalid; // XMM0 case 'z': case '0': if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) return CW_SpecificReg; return CW_Invalid; // Conditional OpMask regs (AVX512) case 'k': if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) return CW_Register; return CW_Invalid; // Any MMX reg case 'm': if (type->isX86_MMXTy() && Subtarget.hasMMX()) return weight; return CW_Invalid; // Any SSE reg when ISA >= SSE2, same as 'Y' case 'i': case 't': case '2': if (!Subtarget.hasSSE2()) return CW_Invalid; break; } // Fall through (handle "Y" constraint). LLVM_FALLTHROUGH; } case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; LLVM_FALLTHROUGH; case 'x': if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX())) weight = CW_Register; break; case 'k': // Enable conditional vector operations using %k<#> registers. 
if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast(info.CallOperandVal)) { if (C->getZExtValue() <= 31) weight = CW_Constant; } break; case 'J': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 63) weight = CW_Constant; } break; case 'K': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) weight = CW_Constant; } break; case 'L': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) weight = CW_Constant; } break; case 'M': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 3) weight = CW_Constant; } break; case 'N': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xff) weight = CW_Constant; } break; case 'G': case 'C': if (isa(CallOperandVal)) { weight = CW_Constant; } break; case 'e': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if ((C->getSExtValue() >= -0x80000000LL) && (C->getSExtValue() <= 0x7fffffffLL)) weight = CW_Constant; } break; case 'Z': if (ConstantInt *C = dyn_cast(CallOperandVal)) { if (C->getZExtValue() <= 0xffffffff) weight = CW_Constant; } break; } return weight; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. const char *X86TargetLowering:: LowerXConstraint(EVT ConstraintVT) const { // FP X constraints get lowered to SSE1/2 registers if available, otherwise // 'f' like normal targets. if (ConstraintVT.isFloatingPoint()) { if (Subtarget.hasSSE2()) return "Y"; if (Subtarget.hasSSE1()) return "x"; } return TargetLowering::LowerXConstraint(ConstraintVT); } /// Lower the specified operand into the Ops vector. /// If it is invalid, don't add anything to Ops. void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Only support length 1 constraints for now. 
if (Constraint.length() > 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'I': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 31) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'J': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 63) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'K': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'L': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'M': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 3) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'N': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 255) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'O': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (C->getZExtValue() <= 127) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } return; case 'e': { // 32-bit signed value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getSExtValue())) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); break; } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. } return; } case 'Z': { // 32-bit unsigned value if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), C->getZExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), Op.getValueType()); break; } } // FIXME gcc accepts some relocatable values here too, but only in certain // memory models; it's complicated. return; } case 'i': { // Literal immediates are always ok. if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { // Widen to 64 bits here to get it sign extended. Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64); break; } // In any sort of PIC mode addresses need to be computed at runtime by // adding in a register or some sort of table lookup. These can't // be used as immediates. if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) return; // If we are in non-pic codegen mode, we allow the address of a global (with // an optional displacement) to be used with 'i'. GlobalAddressSDNode *GA = nullptr; int64_t Offset = 0; // Match either (GA), (GA+C), (GA+C1+C2), etc. while (1) { if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { Offset += GA->getOffset(); break; } else if (Op.getOpcode() == ISD::ADD) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { Offset += C->getZExtValue(); Op = Op.getOperand(0); continue; } } else if (Op.getOpcode() == ISD::SUB) { if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { Offset += -C->getZExtValue(); Op = Op.getOperand(0); continue; } } // Otherwise, this isn't something we can handle, reject it.
return; } const GlobalValue *GV = GA->getGlobal(); // If we require an extra load to get this address, as in PIC mode, we // can't accept it. if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV))) return; Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op), GA->getValueType(0), Offset); break; } } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } /// Check if \p RC is a general purpose register class. /// I.e., GR* or one of their variant. static bool isGRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::GR8RegClass) || RC.hasSuperClassEq(&X86::GR16RegClass) || RC.hasSuperClassEq(&X86::GR32RegClass) || RC.hasSuperClassEq(&X86::GR64RegClass) || RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); } /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { return RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || RC.hasSuperClassEq(&X86::VR512RegClass); } std::pair<unsigned, const TargetRegisterClass *> X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { // First, see if this is a constraint that directly corresponds to an LLVM // register class. if (Constraint.size() == 1) { // GCC Constraint Letters switch (Constraint[0]) { default: break; // TODO: Slight differences here in allocation order and leaving // RIP in the class. Do they matter any more here than they do // in the normal allocation? case 'k': if (Subtarget.hasAVX512()) { // Only supported in AVX512 or later. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32RegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16RegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8RegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1RegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64RegClass); } } break; case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
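// In 64-bit mode any GR8/GR16/GR32/GR64 class satisfies 'q'; in 32-bit mode control falls through to the 'Q' (ABCD) register classes below.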
if (Subtarget.is64Bit()) { if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i64 || VT == MVT::f64) return std::make_pair(0U, &X86::GR64RegClass); break; } LLVM_FALLTHROUGH; // 32-bit fallthrough case 'Q': // Q_REGS if (VT == MVT::i32 || VT == MVT::f32) return std::make_pair(0U, &X86::GR32_ABCDRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_ABCDRegClass); if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); if (VT == MVT::i64) return std::make_pair(0U, &X86::GR64_ABCDRegClass); break; case 'r': // GENERAL_REGS case 'l': // INDEX_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8RegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16RegClass); if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32RegClass); return std::make_pair(0U, &X86::GR64RegClass); case 'R': // LEGACY_REGS if (VT == MVT::i8 || VT == MVT::i1) return std::make_pair(0U, &X86::GR8_NOREXRegClass); if (VT == MVT::i16) return std::make_pair(0U, &X86::GR16_NOREXRegClass); if (VT == MVT::i32 || !Subtarget.is64Bit()) return std::make_pair(0U, &X86::GR32_NOREXRegClass); return std::make_pair(0U, &X86::GR64_NOREXRegClass); case 'f': // FP Stack registers. // If SSE is enabled for this VT, use f80 to ensure the isel moves the // value to the correct fpstack register class. if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP32RegClass); if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) return std::make_pair(0U, &X86::RFP64RegClass); return std::make_pair(0U, &X86::RFP80RegClass); case 'y': // MMX_REGS if MMX allowed. if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'Y': // SSE_REGS if SSE2 allowed if (!Subtarget.hasSSE2()) break; LLVM_FALLTHROUGH; case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed if (!Subtarget.hasSSE1()) break; bool VConstraint = (Constraint[0] == 'v'); switch (VT.SimpleTy) { default: break; // Scalar SSE types. case MVT::f32: case MVT::i32: if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR32XRegClass); return std::make_pair(0U, &X86::FR32RegClass); case MVT::f64: case MVT::i64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::FR64XRegClass); return std::make_pair(0U, &X86::FR64RegClass); // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR128XRegClass); return std::make_pair(0U, &X86::VR128RegClass); // AVX types. 
case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: if (VConstraint && Subtarget.hasVLX()) return std::make_pair(0U, &X86::VR256XRegClass); return std::make_pair(0U, &X86::VR256RegClass); case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64: return std::make_pair(0U, &X86::VR512RegClass); } break; } } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { switch (Constraint[1]) { default: break; case 'i': case 't': case '2': return getRegForInlineAsmConstraint(TRI, "Y", VT); case 'm': if (!Subtarget.hasMMX()) break; return std::make_pair(0U, &X86::VR64RegClass); case 'z': case '0': if (!Subtarget.hasSSE1()) break; return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { // Only supported in AVX512. switch (VT.SimpleTy) { default: break; case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass); case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass); case MVT::i8: return std::make_pair(0U, &X86::VK8WMRegClass); case MVT::i1: return std::make_pair(0U, &X86::VK1WMRegClass); case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass); } } break; } } // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair<unsigned, const TargetRegisterClass *> Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { // Map st(0) -> st(7) -> ST0 if (Constraint.size() == 7 && Constraint[0] == '{' && tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && Constraint[3] == '(' && (Constraint[4] >= '0' && Constraint[4] <= '7') && Constraint[5] == ')' && Constraint[6] == '}') { // st(7) is not allocatable and thus not a member of RFP80. Return // singleton class in cases where we have a reference to it. if (Constraint[4] == '7') return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); return std::make_pair(X86::FP0 + Constraint[4] - '0', &X86::RFP80RegClass); } // GCC allows "st(0)" to be called just plain "st". if (StringRef("{st}").equals_lower(Constraint)) return std::make_pair(X86::FP0, &X86::RFP80RegClass); // flags -> EFLAGS if (StringRef("{flags}").equals_lower(Constraint)) return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); // dirflag -> DF if (StringRef("{dirflag}").equals_lower(Constraint)) return std::make_pair(X86::DF, &X86::DFCCRRegClass); // fpsr -> FPSW if (StringRef("{fpsr}").equals_lower(Constraint)) return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); // 'A' means [ER]AX + [ER]DX. if (Constraint == "A") { if (Subtarget.is64Bit()) return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && "Expecting 64, 32 or 16 bit subtarget"); return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); } return Res; } // Make sure it isn't a register that requires 64-bit mode. if (!Subtarget.is64Bit() && (isFRClass(*Res.second) || isGRClass(*Res.second)) && TRI->getEncodingValue(Res.first) >= 8) { // Register requires REX prefix, but we're in 32-bit mode. return std::make_pair(0, nullptr); } // Make sure it isn't a register that requires AVX512. if (!Subtarget.hasAVX512() && isFRClass(*Res.second) && TRI->getEncodingValue(Res.first) & 0x10) { // Register requires EVEX prefix. return std::make_pair(0, nullptr); } // Otherwise, check to see if this is a register class of the wrong value // type.
For example, we want to map "{ax},i32" -> {eax}, we don't want it to // turn into {ax},{dx}. // MVT::Other is used to specify clobber names. if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) return Res; // Correct type already, nothing to do. // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should // return "eax". This should even work for things like getting 64bit integer // registers when given an f64 type. const TargetRegisterClass *Class = Res.second; // The generic code will match the first register class that contains the // given register. Thus, based on the ordering of the tablegened file, // the "plain" GR classes might not come first. // Therefore, use a helper method. if (isGRClass(*Class)) { unsigned Size = VT.getSizeInBits(); if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { bool is64Bit = Subtarget.is64Bit(); const TargetRegisterClass *RC = Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr) : nullptr; if (Size == 64 && !is64Bit) { // Model GCC's behavior here and select a fixed pair of 32-bit // registers. switch (Res.first) { case X86::EAX: return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); case X86::EDX: return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); case X86::ECX: return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); case X86::EBX: return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); case X86::ESI: return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); case X86::EDI: return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); case X86::EBP: return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); default: return std::make_pair(0, nullptr); } } if (RC && RC->contains(DestReg)) return std::make_pair(DestReg, RC); return Res; } // No register found/type mismatch. return std::make_pair(0, nullptr); } else if (isFRClass(*Class)) { // Handle references to XMM physical registers that got mapped into the // wrong class. This can happen with constraints like {xmm0} where the // target independent register mapper will just pick the first match it can // find, ignoring the required type. // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) Res.second = &X86::FR64RegClass; else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT)) Res.second = &X86::VR128RegClass; else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT)) Res.second = &X86::VR256RegClass; else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) Res.second = &X86::VR512RegClass; else { // Type mismatch and not a clobber: Return an error; Res.first = 0; Res.second = nullptr; } } return Res; } int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // An indexed folded instruction, i.e., inst (reg1, reg2, scale), // will take 2 allocations in the out of order engine instead of 1 // for plain addressing mode, i.e. inst (reg1). 
// E.g., // vaddps (%rsi,%rdx), %ymm0, %ymm1 // Requires two allocations (one for the load, one for the computation) // whereas: // vaddps (%rsi), %ymm0, %ymm1 // Requires just 1 allocation, i.e., freeing allocations for other operations // and having less micro operations to execute. // // For some X86 architectures, this is even worse because for instance for // stores, the complex addressing mode forces the instruction to use the // "load" ports instead of the dedicated "store" port. // E.g., on Haswell: // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 // as soon as we use a second register. return AM.Scale != 0; return -1; } bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on x86 is expensive. However, when aggressively optimizing // for code size, we prefer to use a div instruction, as it is usually smaller // than the alternative sequence. // The exception to this is vector division. Since x86 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { if (!Subtarget.is64Bit()) return; // Update IsSplitCSR in X86MachineFunctionInfo. X86MachineFunctionInfo *AFI = Entry->getParent()->getInfo<X86MachineFunctionInfo>(); AFI->setIsSplitCSR(true); } void X86TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl<MachineBasicBlock *> &Exits) const { const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (X86::GR64RegClass.contains(*I)) RC = &X86::GR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool X86TargetLowering::supportSwiftError() const { return Subtarget.is64Bit(); } /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them.
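// A "probe-stack" function attribute, when present, names the probe symbol directly and takes precedence over the per-OS defaults chosen below.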
if (MF.getFunction().hasFnAttribute("probe-stack")) return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() || MF.getFunction().hasFnAttribute("no-stack-arg-probe")) return ""; // We need a stack probe to conform to the Windows ABI. Choose the right // symbol. if (Subtarget.is64Bit()) return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; } Index: vendor/llvm/dist-release_80/test/CodeGen/AArch64/win64-jumptable.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/AArch64/win64-jumptable.ll (nonexistent) +++ vendor/llvm/dist-release_80/test/CodeGen/AArch64/win64-jumptable.ll (revision 344765) @@ -0,0 +1,48 @@ +; RUN: llc -o - %s -mtriple=aarch64-windows -aarch64-enable-compress-jump-tables=0 | FileCheck %s + +define void @f(i32 %x) { +entry: + switch i32 %x, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb1 + i32 2, label %sw.bb2 + i32 3, label %sw.bb3 + ] + +sw.bb: ; preds = %entry + tail call void @g(i32 0) #2 + br label %sw.epilog + +sw.bb1: ; preds = %entry + tail call void @g(i32 1) #2 + br label %sw.epilog + +sw.bb2: ; preds = %entry + tail call void @g(i32 2) #2 + br label %sw.epilog + +sw.bb3: ; preds = %entry + tail call void @g(i32 3) #2 + br label %sw.epilog + +sw.epilog: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1, %sw.bb + tail call void @g(i32 10) #2 + ret void +} + +declare void @g(i32) + +; CHECK: .text +; CHECK: f: +; CHECK: .seh_proc f +; CHECK: b g +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LJTI0_0: +; CHECK: .word .LBB0_2-.LJTI0_0 +; CHECK: .word .LBB0_3-.LJTI0_0 +; CHECK: .word .LBB0_4-.LJTI0_0 +; CHECK: .word .LBB0_5-.LJTI0_0 +; CHECK: .section .xdata,"dr" +; CHECK: .seh_handlerdata +; CHECK: .text +; CHECK: .seh_endproc Index: vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch-cbz.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch-cbz.ll (nonexistent) +++ vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch-cbz.ll (revision 344765) @@ -0,0 +1,40 @@ +; RUN: llc < %s | FileCheck %s + +; Make sure the prologue is sane. (Doesn't need to exactly match this, +; but the original issue only reproduced if the cbz was immediately +; after the frame setup.) 
+ +; CHECK: sub sp, sp, #32 +; CHECK-NEXT: stp x29, x30, [sp, #16] +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: orr x1, xzr, #0xfffffffffffffffe +; CHECK-NEXT: stur x1, [x29, #-16] +; CHECK-NEXT: cbz w0, .LBB0_2 + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-windows-msvc19.11.0" + +; Function Attrs: uwtable +define dso_local void @"?f@@YAXH@Z"(i32 %x) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { +entry: + %cmp = icmp eq i32 %x, 0 + br i1 %cmp, label %try.cont, label %if.then + +if.then: ; preds = %entry + invoke void @"?g@@YAXXZ"() + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %if.then + %0 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null, i32 64, i8* null] + catchret from %1 to label %try.cont + +try.cont: ; preds = %entry, %if.then, %catch + ret void +} + +declare dso_local void @"?g@@YAXXZ"() local_unnamed_addr #1 + +declare dso_local i32 @__CxxFrameHandler3(...) Index: vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch.ll (revision 344764) +++ vendor/llvm/dist-release_80/test/CodeGen/AArch64/wineh-try-catch.ll (revision 344765) @@ -1,197 +1,197 @@ ; RUN: llc -o - %s -mtriple=aarch64-windows -verify-machineinstrs | FileCheck %s ; RUN: llc -o %t -filetype=obj %s -mtriple=aarch64-windows ; RUN: llvm-readobj -unwind %t | FileCheck %s -check-prefix=UNWIND ; We test the following ; 1) That the unwind help object is created and that its offset from the stack ; pointer on entry is patched into the table fed to __CxxFrameHandler3 ; 2) That the stack update for the catch funclet only includes the callee saved ; registers ; 3) That the locals are accessed using the frame pointer in both the funclet ; and the parent function. ; The following checks that the unwind help object has -2 stored into it at ; fp - 400 - 256 = fp - 656, which is on-entry sp - 48 + 32 - 656 = ; on-entry sp - 672. We check this offset in the table later on. ; CHECK-LABEL: "?func@@YAHXZ": ; CHECK: str x28, [sp, #-48]! ; CHECK: str x21, [sp, #8] ; CHECK: stp x19, x20, [sp, #16] ; CHECK: stp x29, x30, [sp, #32] ; CHECK: add x29, sp, #32 ; CHECK: sub sp, sp, #624 ; CHECK: mov x19, sp -; CHECK: orr x1, xzr, #0xfffffffffffffffe -; CHECK: stur x1, [x19] +; CHECK: orr x0, xzr, #0xfffffffffffffffe +; CHECK: stur x0, [x19] ; Now check that x is stored at fp - 20. We check that this is the same ; location accessed from the funclet to retrieve x. ; CHECK: orr w8, wzr, #0x1 ; CHECK: stur w8, [x29, [[X_OFFSET:#-[1-9][0-9]+]] ; Check the offset off the frame pointer at which B is located. ; Check the same offset is used to pass the address of B to init2 in the ; funclet. ; CHECK: sub x0, x29, [[B_OFFSET:#[1-9][0-9]+]] ; CHECK: bl "?init@@YAXPEAH@Z" ; This is the label for the throw that is encoded in the ip2state. ; We are inside the try block, where we make a call to func2 ; CHECK-LABEL: .Ltmp0: ; CHECK: bl "?func2@@YAHXZ ; CHECK: [[CATCHRETDEST:.LBB0_[0-9]+]]: ; %catchret.dest ; Check the catch funclet. ; CHECK-LABEL: "?catch$2@?0??func@@YAHXZ@4HA": ; Check that the stack space is allocated only for the callee saved registers. ; CHECK: str x28, [sp, #-48]! 
; CHECK: str x21, [sp, #8] ; CHECK: stp x19, x20, [sp, #16] ; CHECK: stp x29, x30, [sp, #32] ; CHECK: add x20, x19, #12 ; Check that there are no further stack updates. ; CHECK-NOT: sub sp, sp ; Check that the stack address passed to init2 is off the frame pointer, and ; that it matches the address of B in the parent function. ; CHECK: sub x0, x29, [[B_OFFSET]] ; CHECK: bl "?init2@@YAXPEAH@Z" ; Check that are storing x back to the same location off the frame pointer as in ; the parent function. ; CHECK: stur w8, [x29, [[X_OFFSET]]] ; Check that the funclet branches back to the catchret destination ; CHECK: adrp x0, .LBB0_3 ; CHECK-NEXT: add x0, x0, [[CATCHRETDEST]] ; Now check that the offset of the unwind help object from the stack pointer on ; entry to func is encoded in cppxdata that is passed to __CxxFrameHandler3. As ; computed above, this comes to -672. ; CHECK-LABEL: "$cppxdata$?func@@YAHXZ": ; CHECK-NEXT: .word 429065506 ; MagicNumber ; CHECK-NEXT: .word 2 ; MaxState ; CHECK-NEXT: .word ("$stateUnwindMap$?func@@YAHXZ")@IMGREL ; UnwindMap ; CHECK-NEXT: .word 1 ; NumTryBlocks ; CHECK-NEXT: .word ("$tryMap$?func@@YAHXZ")@IMGREL ; TryBlockMap ; CHECK-NEXT: .word 4 ; IPMapEntries ; CHECK-NEXT: .word ("$ip2state$?func@@YAHXZ")@IMGREL ; IPToStateXData ; CHECK-NEXT: .word -672 ; UnwindHelp ; UNWIND: Function: ?func@@YAHXZ (0x0) ; UNWIND: Prologue [ ; UNWIND-NEXT: ; nop ; UNWIND-NEXT: ; sub sp, #624 ; UNWIND-NEXT: ; add fp, sp, #32 ; UNWIND-NEXT: ; stp x29, x30, [sp, #32] ; UNWIND-NEXT: ; stp x19, x20, [sp, #16] ; UNWIND-NEXT: ; str x21, [sp, #8] ; UNWIND-NEXT: ; str x28, [sp, #48]! ; UNWIND-NEXT: ; end ; UNWIND: Function: ?catch$2@?0??func@@YAHXZ@4HA ; UNWIND: Prologue [ ; UNWIND-NEXT: ; stp x29, x30, [sp, #32] ; UNWIND-NEXT: ; stp x19, x20, [sp, #16] ; UNWIND-NEXT: ; str x21, [sp, #8] ; UNWIND-NEXT: ; str x28, [sp, #48]! 
; UNWIND-NEXT: ; end target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-windows-msvc19.11.0" %rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } %eh.CatchableType = type { i32, i32, i32, i32, i32, i32, i32 } %eh.CatchableTypeArray.1 = type { i32, [1 x i32] } %eh.ThrowInfo = type { i32, i32, i32, i32 } $"??_R0H@8" = comdat any $"_CT??_R0H@84" = comdat any $_CTA1H = comdat any $_TI1H = comdat any @"??_7type_info@@6B@" = external constant i8* @"??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat @__ImageBase = external dso_local constant i8 @"_CT??_R0H@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%rtti.TypeDescriptor2* @"??_R0H@8" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32), i32 0, i32 -1, i32 0, i32 4, i32 0 }, section ".xdata", comdat @_CTA1H = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x i32] [i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableType* @"_CT??_R0H@84" to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32)] }, section ".xdata", comdat @_TI1H = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i32 0, i32 0, i32 trunc (i64 sub nuw nsw (i64 ptrtoint (%eh.CatchableTypeArray.1* @_CTA1H to i64), i64 ptrtoint (i8* @__ImageBase to i64)) to i32) }, section ".xdata", comdat ; Function Attrs: noinline optnone define dso_local i32 @"?func@@YAHXZ"() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { entry: %B = alloca [50 x i32], align 4 %x = alloca i32, align 4 %tmp = alloca i32, align 4 %i = alloca i32, align 4 %C = alloca [100 x i32], align 4 store i32 1, i32* %x, align 4 %arraydecay = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0 call void @"?init@@YAXPEAH@Z"(i32* %arraydecay) %call = invoke i32 @"?func2@@YAHXZ"() to label %invoke.cont unwind label %catch.dispatch invoke.cont: ; preds = %entry store i32 %call, i32* %tmp, align 4 %0 = bitcast i32* %tmp to i8* invoke void @_CxxThrowException(i8* %0, %eh.ThrowInfo* @_TI1H) #2 to label %unreachable unwind label %catch.dispatch catch.dispatch: ; preds = %invoke.cont, %entry %1 = catchswitch within none [label %catch] unwind to caller catch: ; preds = %catch.dispatch %2 = catchpad within %1 [%rtti.TypeDescriptor2* @"??_R0H@8", i32 0, i32* %i] %arraydecay1 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i32 0, i32 0 call void @"?init@@YAXPEAH@Z"(i32* %arraydecay1) [ "funclet"(token %2) ] %arraydecay2 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i32 0, i32 0 call void @"?init2@@YAXPEAH@Z"(i32* %arraydecay2) [ "funclet"(token %2) ] %3 = load i32, i32* %i, align 4 %idxprom = sext i32 %3 to i64 %arrayidx = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 %idxprom %4 = load i32, i32* %arrayidx, align 4 %5 = load i32, i32* %i, align 4 %idxprom3 = sext i32 %5 to i64 %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* %C, i64 0, i64 %idxprom3 %6 = load i32, i32* %arrayidx4, align 4 %add = add nsw i32 %4, %6 %7 = load i32, i32* %i, align 4 %8 = load i32, i32* %i, align 4 %mul = mul nsw i32 %7, %8 %add5 = add nsw i32 %add, %mul store i32 %add5, i32* %x, align 4 catchret from %2 to label %catchret.dest catchret.dest: ; preds = %catch br label %try.cont try.cont: ; preds = %catchret.dest %arrayidx6 = getelementptr inbounds [50 x i32], [50 x i32]* %B, i64 0, i64 2 %9 = load i32, i32* %arrayidx6, align 4 %10 = load i32, i32* %x, align 4 %add7 
= add nsw i32 %9, %10 ret i32 %add7 unreachable: ; preds = %invoke.cont unreachable } declare dso_local void @"?init@@YAXPEAH@Z"(i32*) declare dso_local i32 @"?func2@@YAHXZ"() declare dso_local i32 @__CxxFrameHandler3(...) declare dllimport void @_CxxThrowException(i8*, %eh.ThrowInfo*) declare dso_local void @"?init2@@YAXPEAH@Z"(i32*) attributes #0 = { noinline optnone } attributes #2 = { noreturn } Index: vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/select.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/select.ll (revision 344764) +++ vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/select.ll (revision 344765) @@ -1,159 +1,176 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,SLOW ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel -fast-isel-abort=1 | FileCheck %s ; Test that wasm select instruction is selected from LLVM select instruction. target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" ; CHECK-LABEL: select_i32_bool: ; CHECK-NEXT: .functype select_i32_bool (i32, i32, i32) -> (i32){{$}} ; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @select_i32_bool(i1 zeroext %a, i32 %b, i32 %c) { %cond = select i1 %a, i32 %b, i32 %c ret i32 %cond } ; CHECK-LABEL: select_i32_bool_nozext: ; CHECK-NEXT: .functype select_i32_bool_nozext (i32, i32, i32) -> (i32){{$}} -; SLOW-NEXT: i32.select $push0=, $1, $2, $0{{$}} -; SLOW-NEXT: return $pop0{{$}} +; SLOW-NEXT: i32.const $push0=, 1{{$}} +; SLOW-NEXT: i32.and $push1=, $0, $pop0{{$}} +; SLOW-NEXT: i32.select $push2=, $1, $2, $pop1{{$}} +; SLOW-NEXT: return $pop2{{$}} define i32 @select_i32_bool_nozext(i1 %a, i32 %b, i32 %c) { %cond = select i1 %a, i32 %b, i32 %c ret i32 %cond } ; CHECK-LABEL: select_i32_eq: ; CHECK-NEXT: .functype select_i32_eq (i32, i32, i32) -> (i32){{$}} ; CHECK-NEXT: i32.select $push0=, $2, $1, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @select_i32_eq(i32 %a, i32 %b, i32 %c) { %cmp = icmp eq i32 %a, 0 %cond = select i1 %cmp, i32 %b, i32 %c ret i32 %cond } ; CHECK-LABEL: select_i32_ne: ; CHECK-NEXT: .functype select_i32_ne (i32, i32, i32) -> (i32){{$}} ; CHECK-NEXT: i32.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i32 @select_i32_ne(i32 %a, i32 %b, i32 %c) { %cmp = icmp ne i32 %a, 0 %cond = select i1 %cmp, i32 %b, i32 %c ret i32 %cond } ; CHECK-LABEL: select_i64_bool: ; CHECK-NEXT: .functype select_i64_bool (i32, i64, i64) -> (i64){{$}} ; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @select_i64_bool(i1 zeroext %a, i64 %b, i64 %c) { %cond = select i1 %a, i64 %b, i64 %c ret i64 %cond } ; CHECK-LABEL: select_i64_bool_nozext: ; CHECK-NEXT: .functype select_i64_bool_nozext (i32, i64, i64) -> (i64){{$}} -; SLOW-NEXT: i64.select $push0=, $1, $2, $0{{$}} -; SLOW-NEXT: return $pop0{{$}} +; SLOW-NEXT: i32.const $push0=, 1{{$}} +; SLOW-NEXT: i32.and $push1=, $0, $pop0{{$}} +; SLOW-NEXT: i64.select $push2=, $1, $2, $pop1{{$}} +; SLOW-NEXT: return $pop2{{$}} define i64 @select_i64_bool_nozext(i1 %a, i64 %b, i64 %c) { %cond = select i1 %a, i64 %b, i64 %c ret i64 %cond } ; CHECK-LABEL: select_i64_eq: ; CHECK-NEXT: .functype select_i64_eq (i32, i64, i64) -> (i64){{$}} ; CHECK-NEXT: i64.select $push0=, 
$2, $1, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @select_i64_eq(i32 %a, i64 %b, i64 %c) { %cmp = icmp eq i32 %a, 0 %cond = select i1 %cmp, i64 %b, i64 %c ret i64 %cond } ; CHECK-LABEL: select_i64_ne: ; CHECK-NEXT: .functype select_i64_ne (i32, i64, i64) -> (i64){{$}} ; CHECK-NEXT: i64.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define i64 @select_i64_ne(i32 %a, i64 %b, i64 %c) { %cmp = icmp ne i32 %a, 0 %cond = select i1 %cmp, i64 %b, i64 %c ret i64 %cond } ; CHECK-LABEL: select_f32_bool: ; CHECK-NEXT: .functype select_f32_bool (i32, f32, f32) -> (f32){{$}} ; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define float @select_f32_bool(i1 zeroext %a, float %b, float %c) { %cond = select i1 %a, float %b, float %c ret float %cond } ; CHECK-LABEL: select_f32_bool_nozext: ; CHECK-NEXT: .functype select_f32_bool_nozext (i32, f32, f32) -> (f32){{$}} ; SLOW-NEXT: f32.select $push0=, $1, $2, $0{{$}} ; SLOW-NEXT: return $pop0{{$}} define float @select_f32_bool_nozext(i1 %a, float %b, float %c) { %cond = select i1 %a, float %b, float %c ret float %cond } ; CHECK-LABEL: select_f32_eq: ; CHECK-NEXT: .functype select_f32_eq (i32, f32, f32) -> (f32){{$}} ; CHECK-NEXT: f32.select $push0=, $2, $1, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define float @select_f32_eq(i32 %a, float %b, float %c) { %cmp = icmp eq i32 %a, 0 %cond = select i1 %cmp, float %b, float %c ret float %cond } ; CHECK-LABEL: select_f32_ne: ; CHECK-NEXT: .functype select_f32_ne (i32, f32, f32) -> (f32){{$}} ; CHECK-NEXT: f32.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define float @select_f32_ne(i32 %a, float %b, float %c) { %cmp = icmp ne i32 %a, 0 %cond = select i1 %cmp, float %b, float %c ret float %cond } ; CHECK-LABEL: select_f64_bool: ; CHECK-NEXT: .functype select_f64_bool (i32, f64, f64) -> (f64){{$}} ; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define double @select_f64_bool(i1 zeroext %a, double %b, double %c) { %cond = select i1 %a, double %b, double %c ret double %cond } ; CHECK-LABEL: select_f64_bool_nozext: ; CHECK-NEXT: .functype select_f64_bool_nozext (i32, f64, f64) -> (f64){{$}} ; SLOW-NEXT: f64.select $push0=, $1, $2, $0{{$}} ; SLOW-NEXT: return $pop0{{$}} define double @select_f64_bool_nozext(i1 %a, double %b, double %c) { %cond = select i1 %a, double %b, double %c ret double %cond } ; CHECK-LABEL: select_f64_eq: ; CHECK-NEXT: .functype select_f64_eq (i32, f64, f64) -> (f64){{$}} ; CHECK-NEXT: f64.select $push0=, $2, $1, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define double @select_f64_eq(i32 %a, double %b, double %c) { %cmp = icmp eq i32 %a, 0 %cond = select i1 %cmp, double %b, double %c ret double %cond } ; CHECK-LABEL: select_f64_ne: ; CHECK-NEXT: .functype select_f64_ne (i32, f64, f64) -> (f64){{$}} ; CHECK-NEXT: f64.select $push0=, $1, $2, $0{{$}} ; CHECK-NEXT: return $pop0{{$}} define double @select_f64_ne(i32 %a, double %b, double %c) { %cmp = icmp ne i32 %a, 0 %cond = select i1 %cmp, double %b, double %c ret double %cond +} + +; CHECK-LABEL: pr40805: +; CHECK-NEXT: .functype pr40805 (i32, i32, i32) -> (i32){{$}} +; SLOW-NEXT: i32.const $push0=, 1{{$}} +; SLOW-NEXT: i32.and $push1=, $0, $pop0{{$}} +; SLOW-NEXT: i32.select $push2=, $1, $2, $pop1{{$}} +; SLOW-NEXT: return $pop2{{$}} +define i32 @pr40805(i32 %x, i32 %y, i32 %z) { + %a = and i32 %x, 1 + %b = icmp ne i32 %a, 0 + %c = select i1 %b, i32 %y, i32 %z + ret i32 %c } Index: vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/simd-select.ll 
=================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/simd-select.ll (revision 344764) +++ vendor/llvm/dist-release_80/test/CodeGen/WebAssembly/simd-select.ll (revision 344765) @@ -1,433 +1,433 @@ ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s ; Test that vector selects of various varieties lower correctly to bitselects. target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" ; ============================================================================== ; 16 x i8 ; ============================================================================== ; CHECK-LABEL: vselect_v16i8: ; CHECK-NEXT: .functype vselect_v16i8 (v128, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 7{{$}} ; CHECK-NEXT: i8x16.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 7{{$}} ; CHECK-NEXT: i8x16.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <16 x i8> @vselect_v16i8(<16 x i1> %c, <16 x i8> %x, <16 x i8> %y) { %res = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %res } ; CHECK-LABEL: select_v16i8: ; CHECK-NEXT: .functype select_v16i8 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <16 x i8> @select_v16i8(i1 %c, <16 x i8> %x, <16 x i8> %y) { +define <16 x i8> @select_v16i8(i1 zeroext %c, <16 x i8> %x, <16 x i8> %y) { %res = select i1 %c, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %res } ; CHECK-LABEL: select_cmp_v16i8: ; CHECK-NEXT: .functype select_cmp_v16i8 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31 ; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i8x16.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <16 x i8> @select_cmp_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %res } ; CHECK-LABEL: select_ne_v16i8: ; CHECK-NEXT: .functype select_ne_v16i8 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <16 x i8> @select_ne_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %res } ; CHECK-LABEL: select_eq_v16i8: ; CHECK-NEXT: .functype select_eq_v16i8 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; 
CHECK-NEXT: i8x16.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <16 x i8> @select_eq_v16i8(i32 %i, <16 x i8> %x, <16 x i8> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <16 x i8> %x, <16 x i8> %y ret <16 x i8> %res } ; ============================================================================== ; 8 x i16 ; ============================================================================== ; CHECK-LABEL: vselect_v8i16: ; CHECK-NEXT: .functype vselect_v8i16 (v128, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 15{{$}} ; CHECK-NEXT: i16x8.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 15{{$}} ; CHECK-NEXT: i16x8.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <8 x i16> @vselect_v8i16(<8 x i1> %c, <8 x i16> %x, <8 x i16> %y) { %res = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %res } ; CHECK-LABEL: select_v8i16: ; CHECK-NEXT: .functype select_v8i16 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <8 x i16> @select_v8i16(i1 %c, <8 x i16> %x, <8 x i16> %y) { +define <8 x i16> @select_v8i16(i1 zeroext %c, <8 x i16> %x, <8 x i16> %y) { %res = select i1 %c, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %res } ; CHECK-LABEL: select_cmp_v8i16: ; CHECK-NEXT: .functype select_cmp_v8i16 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i16x8.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <8 x i16> @select_cmp_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %res } ; CHECK-LABEL: select_ne_v8i16: ; CHECK-NEXT: .functype select_ne_v8i16 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <8 x i16> @select_ne_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %res } ; CHECK-LABEL: select_eq_v8i16: ; CHECK-NEXT: .functype select_eq_v8i16 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i16x8.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <8 x i16> @select_eq_v8i16(i32 %i, <8 x i16> %x, <8 x i16> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <8 x i16> 
%x, <8 x i16> %y ret <8 x i16> %res } ; ============================================================================== ; 4 x i32 ; ============================================================================== ; CHECK-LABEL: vselect_v4i32: ; CHECK-NEXT: .functype vselect_v4i32 (v128, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32x4.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x i32> @vselect_v4i32(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y) { %res = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y ret <4 x i32> %res } ; CHECK-LABEL: select_v4i32: ; CHECK-NEXT: .functype select_v4i32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <4 x i32> @select_v4i32(i1 %c, <4 x i32> %x, <4 x i32> %y) { +define <4 x i32> @select_v4i32(i1 zeroext %c, <4 x i32> %x, <4 x i32> %y) { %res = select i1 %c, <4 x i32> %x, <4 x i32> %y ret <4 x i32> %res } ; CHECK-LABEL: select_cmp_v4i32: ; CHECK-NEXT: .functype select_cmp_v4i32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32x4.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x i32> @select_cmp_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <4 x i32> %x, <4 x i32> %y ret <4 x i32> %res } ; CHECK-LABEL: select_ne_v4i32: ; CHECK-NEXT: .functype select_ne_v4i32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x i32> @select_ne_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <4 x i32> %x, <4 x i32> %y ret <4 x i32> %res } ; CHECK-LABEL: select_eq_v4i32: ; CHECK-NEXT: .functype select_eq_v4i32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x i32> @select_eq_v4i32(i32 %i, <4 x i32> %x, <4 x i32> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <4 x i32> %x, <4 x i32> %y ret <4 x i32> %res } ; ============================================================================== ; 2 x i64 ; ============================================================================== ; CHECK-LABEL: vselect_v2i64: ; CHECK-NEXT: .functype vselect_v2i64 (v128, v128, 
v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}} ; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}} ; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x i64> @vselect_v2i64(<2 x i1> %c, <2 x i64> %x, <2 x i64> %y) { %res = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %res } ; CHECK-LABEL: select_v2i64: ; CHECK-NEXT: .functype select_v2i64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <2 x i64> @select_v2i64(i1 %c, <2 x i64> %x, <2 x i64> %y) { +define <2 x i64> @select_v2i64(i1 zeroext %c, <2 x i64> %x, <2 x i64> %y) { %res = select i1 %c, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %res } ; CHECK-LABEL: select_cmp_v2i64: ; CHECK-NEXT: .functype select_cmp_v2i64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.lt_s $push[[L3:[0-9]+]]=, $0, $pop[[L2]]{{$}} ; CHECK-NEXT: i64.select $push[[L4:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $pop[[L3]]{{$}} ; CHECK-NEXT: i64x2.splat $push[[L5:[0-9]+]]=, $pop[[L4]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L5]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x i64> @select_cmp_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %res } ; CHECK-LABEL: select_ne_v2i64: ; CHECK-NEXT: .functype select_ne_v2i64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x i64> @select_ne_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %res } ; CHECK-LABEL: select_eq_v2i64: ; CHECK-NEXT: .functype select_eq_v2i64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x i64> @select_eq_v2i64(i32 %i, <2 x i64> %x, <2 x i64> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <2 x i64> %x, <2 x i64> %y ret <2 x i64> %res } ; ============================================================================== ; 4 x float ; ============================================================================== ; CHECK-LABEL: vselect_v4f32: ; CHECK-NEXT: .functype vselect_v4f32 (v128, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32x4.shl 
$push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32x4.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x float> @vselect_v4f32(<4 x i1> %c, <4 x float> %x, <4 x float> %y) { %res = select <4 x i1> %c, <4 x float> %x, <4 x float> %y ret <4 x float> %res } ; CHECK-LABEL: select_v4f32: ; CHECK-NEXT: .functype select_v4f32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <4 x float> @select_v4f32(i1 %c, <4 x float> %x, <4 x float> %y) { +define <4 x float> @select_v4f32(i1 zeroext %c, <4 x float> %x, <4 x float> %y) { %res = select i1 %c, <4 x float> %x, <4 x float> %y ret <4 x float> %res } ; CHECK-LABEL: select_cmp_v4f32: ; CHECK-NEXT: .functype select_cmp_v4f32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 31{{$}} ; CHECK-NEXT: i32.shr_s $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32x4.splat $push[[L2:[0-9]+]]=, $pop[[L1]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L2]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x float> @select_cmp_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <4 x float> %x, <4 x float> %y ret <4 x float> %res } ; CHECK-LABEL: select_ne_v4f32: ; CHECK-NEXT: .functype select_ne_v4f32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x float> @select_ne_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <4 x float> %x, <4 x float> %y ret <4 x float> %res } ; CHECK-LABEL: select_eq_v4f32: ; CHECK-NEXT: .functype select_eq_v4f32 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i32.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i32x4.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <4 x float> @select_eq_v4f32(i32 %i, <4 x float> %x, <4 x float> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <4 x float> %x, <4 x float> %y ret <4 x float> %res } ; ============================================================================== ; 2 x double ; ============================================================================== ; CHECK-LABEL: vselect_v2f64: ; CHECK-NEXT: .functype vselect_v2f64 (v128, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 63{{$}} ; CHECK-NEXT: i64x2.shl $push[[L1:[0-9]+]]=, $0, $pop[[L0]]{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 63{{$}} ; CHECK-NEXT: i64x2.shr_s $push[[L3:[0-9]+]]=, $pop[[L1]], $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, 
$pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x double> @vselect_v2f64(<2 x i1> %c, <2 x double> %x, <2 x double> %y) { %res = select <2 x i1> %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } ; CHECK-LABEL: select_v2f64: ; CHECK-NEXT: .functype select_v2f64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -define <2 x double> @select_v2f64(i1 %c, <2 x double> %x, <2 x double> %y) { +define <2 x double> @select_v2f64(i1 zeroext %c, <2 x double> %x, <2 x double> %y) { %res = select i1 %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } ; CHECK-LABEL: select_cmp_v2f64: ; CHECK-NEXT: .functype select_cmp_v2f64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.const $push[[L2:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i32.lt_s $push[[L3:[0-9]+]]=, $0, $pop[[L2]]{{$}} ; CHECK-NEXT: i64.select $push[[L4:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $pop[[L3]]{{$}} ; CHECK-NEXT: i64x2.splat $push[[L5:[0-9]+]]=, $pop[[L4]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L5]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x double> @select_cmp_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) { %c = icmp slt i32 %i, 0 %res = select i1 %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } ; CHECK-LABEL: select_ne_v2f64: ; CHECK-NEXT: .functype select_ne_v2f64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x double> @select_ne_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) { %c = icmp ne i32 %i, 0 %res = select i1 %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } ; CHECK-LABEL: select_eq_v2f64: ; CHECK-NEXT: .functype select_eq_v2f64 (i32, v128, v128) -> (v128){{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: i64.const $push[[L1:[0-9]+]]=, -1{{$}} ; CHECK-NEXT: i64.select $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $0{{$}} ; CHECK-NEXT: i64x2.splat $push[[L3:[0-9]+]]=, $pop[[L2]]{{$}} ; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $1, $2, $pop[[L3]]{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} define <2 x double> @select_eq_v2f64(i32 %i, <2 x double> %x, <2 x double> %y) { %c = icmp eq i32 %i, 0 %res = select i1 %c, <2 x double> %x, <2 x double> %y ret <2 x double> %res } Index: vendor/llvm/dist-release_80/test/CodeGen/X86/code-model-elf.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/X86/code-model-elf.ll (revision 344764) +++ vendor/llvm/dist-release_80/test/CodeGen/X86/code-model-elf.ll (revision 344765) @@ -1,384 +1,450 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; Run with --no_x86_scrub_rip because we care a lot about how globals are ; accessed in the code model. 
; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=small | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-STATIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-STATIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=large | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-STATIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=small | FileCheck %s --check-prefix=CHECK --check-prefix=SMALL-PIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=medium | FileCheck %s --check-prefix=CHECK --check-prefix=MEDIUM-PIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC ; Generated from this C source: ; ; static int static_data[10]; ; int global_data[10] = {1, 2}; ; extern int extern_data[10]; ; ; int *lea_static_data() { return &static_data[0]; } ; int *lea_global_data() { return &global_data[0]; } ; int *lea_extern_data() { return &extern_data[0]; } ; ; static void static_fn(void) {} ; void global_fn(void) {} ; void extern_fn(void); ; ; typedef void (*void_fn)(void); ; void_fn lea_static_fn() { return &static_fn; } ; void_fn lea_global_fn() { return &global_fn; } ; void_fn lea_extern_fn() { return &extern_fn; } ; ModuleID = 'model.c' source_filename = "model.c" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--linux" @global_data = dso_local global [10 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 @static_data = internal global [10 x i32] zeroinitializer, align 16 @extern_data = external global [10 x i32], align 16 +@thread_data = external thread_local global i32, align 4 + define dso_local i32* @lea_static_data() #0 { ; SMALL-STATIC-LABEL: lea_static_data: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $static_data, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_static_data: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $static_data, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_static_data: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $static_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_static_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: leaq static_data(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_static_data: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx ; MEDIUM-PIC-NEXT: movabsq $static_data@GOTOFF, %rax ; MEDIUM-PIC-NEXT: addq %rcx, %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_static_data: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L0$pb: ; LARGE-PIC-NEXT: leaq .L0$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $static_data@GOTOFF, %rax ; LARGE-PIC-NEXT: addq %rcx, %rax ; LARGE-PIC-NEXT: retq ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @static_data, i64 0, i64 0) } define dso_local i32* @lea_global_data() #0 { ; SMALL-STATIC-LABEL: lea_global_data: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $global_data, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_global_data: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $global_data, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_global_data: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $global_data, %rax ; 
LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_global_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: leaq global_data(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_global_data: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx ; MEDIUM-PIC-NEXT: movabsq $global_data@GOTOFF, %rax ; MEDIUM-PIC-NEXT: addq %rcx, %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_global_data: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L1$pb: ; LARGE-PIC-NEXT: leaq .L1$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L1$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $global_data@GOTOFF, %rax ; LARGE-PIC-NEXT: addq %rcx, %rax ; LARGE-PIC-NEXT: retq ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 0) } define dso_local i32* @lea_extern_data() #0 { ; SMALL-STATIC-LABEL: lea_extern_data: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $extern_data, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_extern_data: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $extern_data, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_extern_data: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $extern_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_extern_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: movq extern_data@GOTPCREL(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_extern_data: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: movq extern_data@GOTPCREL(%rip), %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_extern_data: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L2$pb: ; LARGE-PIC-NEXT: leaq .L2$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L2$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $extern_data@GOT, %rax ; LARGE-PIC-NEXT: movq (%rcx,%rax), %rax ; LARGE-PIC-NEXT: retq ret i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 0) } define dso_local i32 @load_global_data() #0 { ; SMALL-STATIC-LABEL: load_global_data: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl global_data+8(%rip), %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: load_global_data: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $global_data, %rax ; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_global_data: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $global_data, %rax ; LARGE-STATIC-NEXT: movl 8(%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_global_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: movl global_data+8(%rip), %eax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: load_global_data: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax ; MEDIUM-PIC-NEXT: movabsq $global_data@GOTOFF, %rcx ; MEDIUM-PIC-NEXT: movl 8(%rax,%rcx), %eax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: load_global_data: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L3$pb: ; LARGE-PIC-NEXT: leaq .L3$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L3$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $global_data@GOTOFF, %rax ; LARGE-PIC-NEXT: movl 8(%rcx,%rax), %eax ; LARGE-PIC-NEXT: retq %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @global_data, i64 0, i64 2) ret i32 %rv } define dso_local i32 @load_extern_data() #0 { ; SMALL-STATIC-LABEL: load_extern_data: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl extern_data+8(%rip), %eax ; SMALL-STATIC-NEXT: retq ; ; 
MEDIUM-STATIC-LABEL: load_extern_data: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $extern_data, %rax ; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_extern_data: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $extern_data, %rax ; LARGE-STATIC-NEXT: movl 8(%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_extern_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: movq extern_data@GOTPCREL(%rip), %rax ; SMALL-PIC-NEXT: movl 8(%rax), %eax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: load_extern_data: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: movq extern_data@GOTPCREL(%rip), %rax ; MEDIUM-PIC-NEXT: movl 8(%rax), %eax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: load_extern_data: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L4$pb: ; LARGE-PIC-NEXT: leaq .L4$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L4$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $extern_data@GOT, %rax ; LARGE-PIC-NEXT: movq (%rcx,%rax), %rax ; LARGE-PIC-NEXT: movl 8(%rax), %eax ; LARGE-PIC-NEXT: retq %rv = load i32, i32* getelementptr inbounds ([10 x i32], [10 x i32]* @extern_data, i64 0, i64 2) ret i32 %rv } define dso_local void @global_fn() #0 { ; CHECK-LABEL: global_fn: ; CHECK: # %bb.0: ; CHECK-NEXT: retq ret void } define internal void @static_fn() #0 { ; CHECK-LABEL: static_fn: ; CHECK: # %bb.0: ; CHECK-NEXT: retq ret void } declare void @extern_fn() define dso_local void ()* @lea_static_fn() #0 { ; SMALL-STATIC-LABEL: lea_static_fn: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $static_fn, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_static_fn: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $static_fn, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_static_fn: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $static_fn, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_static_fn: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: leaq static_fn(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_static_fn: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: movabsq $static_fn, %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_static_fn: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L7$pb: ; LARGE-PIC-NEXT: leaq .L7$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L7$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $static_fn@GOTOFF, %rax ; LARGE-PIC-NEXT: addq %rcx, %rax ; LARGE-PIC-NEXT: retq ret void ()* @static_fn } define dso_local void ()* @lea_global_fn() #0 { ; SMALL-STATIC-LABEL: lea_global_fn: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $global_fn, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_global_fn: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $global_fn, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_global_fn: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $global_fn, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_global_fn: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: leaq global_fn(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_global_fn: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: movabsq $global_fn, %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_global_fn: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L8$pb: ; LARGE-PIC-NEXT: leaq .L8$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L8$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $global_fn@GOTOFF, %rax ; LARGE-PIC-NEXT: addq %rcx, %rax ; 
LARGE-PIC-NEXT: retq ret void ()* @global_fn } define dso_local void ()* @lea_extern_fn() #0 { ; SMALL-STATIC-LABEL: lea_extern_fn: ; SMALL-STATIC: # %bb.0: ; SMALL-STATIC-NEXT: movl $extern_fn, %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: lea_extern_fn: ; MEDIUM-STATIC: # %bb.0: ; MEDIUM-STATIC-NEXT: movabsq $extern_fn, %rax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: lea_extern_fn: ; LARGE-STATIC: # %bb.0: ; LARGE-STATIC-NEXT: movabsq $extern_fn, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_extern_fn: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: movq extern_fn@GOTPCREL(%rip), %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: lea_extern_fn: ; MEDIUM-PIC: # %bb.0: ; MEDIUM-PIC-NEXT: movq extern_fn@GOTPCREL(%rip), %rax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: lea_extern_fn: ; LARGE-PIC: # %bb.0: ; LARGE-PIC-NEXT: .L9$pb: ; LARGE-PIC-NEXT: leaq .L9$pb(%rip), %rax ; LARGE-PIC-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L9$pb, %rcx ; LARGE-PIC-NEXT: addq %rax, %rcx ; LARGE-PIC-NEXT: movabsq $extern_fn@GOT, %rax ; LARGE-PIC-NEXT: movq (%rcx,%rax), %rax ; LARGE-PIC-NEXT: retq ret void ()* @extern_fn +} + +; FIXME: The result is the same for the small, medium, and large models because +; the test case specifies the PIE option, so the TLS accesses use the initial-exec +; model. For PIC code, the large-model TLS sequence should be emitted as shown below. + +; .L3: +; leaq .L3(%rip), %rbx +; movabsq $_GLOBAL_OFFSET_TABLE_-.L3, %r11 +; addq %r11, %rbx +; leaq thread_data@TLSGD(%rip), %rdi +; movabsq $__tls_get_addr@PLTOFF, %rax +; addq %rbx, %rax +; call *%rax +; movl (%rax), %eax + +; The medium- and small-model code for PIC TLS should be emitted as shown below. +; data16 +; leaq thread_data@TLSGD(%rip), %rdi +; data16 +; data16 +; rex64 +; callq __tls_get_addr@PLT +; movl (%rax), %eax + +define dso_local i32 @load_thread_data() #0 { +; SMALL-STATIC-LABEL: load_thread_data: +; SMALL-STATIC: # %bb.0: +; SMALL-STATIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; SMALL-STATIC-NEXT: movl %fs:(%rax), %eax +; SMALL-STATIC-NEXT: retq +; +; MEDIUM-STATIC-LABEL: load_thread_data: +; MEDIUM-STATIC: # %bb.0: +; MEDIUM-STATIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; MEDIUM-STATIC-NEXT: movl %fs:(%rax), %eax +; MEDIUM-STATIC-NEXT: retq +; +; LARGE-STATIC-LABEL: load_thread_data: +; LARGE-STATIC: # %bb.0: +; LARGE-STATIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; LARGE-STATIC-NEXT: movl %fs:(%rax), %eax +; LARGE-STATIC-NEXT: retq +; +; SMALL-PIC-LABEL: load_thread_data: +; SMALL-PIC: # %bb.0: +; SMALL-PIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; SMALL-PIC-NEXT: movl %fs:(%rax), %eax +; SMALL-PIC-NEXT: retq +; +; MEDIUM-PIC-LABEL: load_thread_data: +; MEDIUM-PIC: # %bb.0: +; MEDIUM-PIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; MEDIUM-PIC-NEXT: movl %fs:(%rax), %eax +; MEDIUM-PIC-NEXT: retq +; +; LARGE-PIC-LABEL: load_thread_data: +; LARGE-PIC: # %bb.0: +; LARGE-PIC-NEXT: movq thread_data@GOTTPOFF(%rip), %rax +; LARGE-PIC-NEXT: movl %fs:(%rax), %eax +; LARGE-PIC-NEXT: retq +; + %1 = load i32, i32* @thread_data, align 4 + ret i32 %1 } attributes #0 = { noinline nounwind uwtable } !llvm.module.flags = !{!0, !1, !2} !llvm.ident = !{!3} !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 7, !"PIC Level", i32 2} !2 = !{i32 7, !"PIE Level", i32 2} !3 = !{!"clang version 7.0.0 "} Index: vendor/llvm/dist-release_80/test/CodeGen/X86/pr40891.ll =================================================================== --- vendor/llvm/dist-release_80/test/CodeGen/X86/pr40891.ll (nonexistent) +++
vendor/llvm/dist-release_80/test/CodeGen/X86/pr40891.ll (revision 344765) @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx2 | FileCheck %s + +; Make sure this sequence doesn't hang in DAG combine. + +define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.LCPI.*}}, %ymm1, %ymm1 +; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: retl + %a = shufflevector <4 x i64> %y, <4 x i64> , <8 x i32> + %b = and <8 x i64> %x, %a + %c = trunc <8 x i64> %b to <8 x i32> + ret <8 x i32> %c +} + Index: vendor/llvm/dist-release_80/tools/llvm-xray/xray-converter.cpp =================================================================== --- vendor/llvm/dist-release_80/tools/llvm-xray/xray-converter.cpp (revision 344764) +++ vendor/llvm/dist-release_80/tools/llvm-xray/xray-converter.cpp (revision 344765) @@ -1,410 +1,423 @@ //===- xray-converter.cpp: XRay Trace Conversion --------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Implements the trace conversion functions. // //===----------------------------------------------------------------------===// #include "xray-converter.h" #include "trie-node.h" #include "xray-registry.h" #include "llvm/DebugInfo/Symbolize/Symbolize.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/JSON.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/YAMLTraits.h" #include "llvm/Support/raw_ostream.h" #include "llvm/XRay/InstrumentationMap.h" #include "llvm/XRay/Trace.h" #include "llvm/XRay/YAMLXRayRecord.h" using namespace llvm; using namespace xray; // llvm-xray convert // ---------------------------------------------------------------------------- static cl::SubCommand Convert("convert", "Trace Format Conversion"); static cl::opt ConvertInput(cl::Positional, cl::desc(""), cl::Required, cl::sub(Convert)); enum class ConvertFormats { BINARY, YAML, CHROME_TRACE_EVENT }; static cl::opt ConvertOutputFormat( "output-format", cl::desc("output format"), cl::values(clEnumValN(ConvertFormats::BINARY, "raw", "output in binary"), clEnumValN(ConvertFormats::YAML, "yaml", "output in yaml"), clEnumValN(ConvertFormats::CHROME_TRACE_EVENT, "trace_event", "Output in chrome's trace event format. 
" "May be visualized with the Catapult trace viewer.")), cl::sub(Convert)); static cl::alias ConvertOutputFormat2("f", cl::aliasopt(ConvertOutputFormat), cl::desc("Alias for -output-format"), cl::sub(Convert)); static cl::opt ConvertOutput("output", cl::value_desc("output file"), cl::init("-"), cl::desc("output file; use '-' for stdout"), cl::sub(Convert)); static cl::alias ConvertOutput2("o", cl::aliasopt(ConvertOutput), cl::desc("Alias for -output"), cl::sub(Convert)); static cl::opt ConvertSymbolize("symbolize", cl::desc("symbolize function ids from the input log"), cl::init(false), cl::sub(Convert)); static cl::alias ConvertSymbolize2("y", cl::aliasopt(ConvertSymbolize), cl::desc("Alias for -symbolize"), cl::sub(Convert)); static cl::opt ConvertInstrMap("instr_map", cl::desc("binary with the instrumentation map, or " "a separate instrumentation map"), cl::value_desc("binary with xray_instr_map"), cl::sub(Convert), cl::init("")); static cl::alias ConvertInstrMap2("m", cl::aliasopt(ConvertInstrMap), cl::desc("Alias for -instr_map"), cl::sub(Convert)); static cl::opt ConvertSortInput( "sort", cl::desc("determines whether to sort input log records by timestamp"), cl::sub(Convert), cl::init(true)); static cl::alias ConvertSortInput2("s", cl::aliasopt(ConvertSortInput), cl::desc("Alias for -sort"), cl::sub(Convert)); using llvm::yaml::Output; void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) { YAMLXRayTrace Trace; const auto &FH = Records.getFileHeader(); Trace.Header = {FH.Version, FH.Type, FH.ConstantTSC, FH.NonstopTSC, FH.CycleFrequency}; Trace.Records.reserve(Records.size()); for (const auto &R : Records) { Trace.Records.push_back({R.RecordType, R.CPU, R.Type, R.FuncId, Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId) : llvm::to_string(R.FuncId), R.TSC, R.TId, R.PId, R.CallArgs, R.Data}); } Output Out(OS, nullptr, 0); Out.setWriteDefaultValues(false); Out << Trace; } void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) { // First write out the file header, in the correct endian-appropriate format // (XRay assumes currently little endian). support::endian::Writer Writer(OS, support::endianness::little); const auto &FH = Records.getFileHeader(); Writer.write(FH.Version); Writer.write(FH.Type); uint32_t Bitfield{0}; if (FH.ConstantTSC) Bitfield |= 1uL; if (FH.NonstopTSC) Bitfield |= 1uL << 1; Writer.write(Bitfield); Writer.write(FH.CycleFrequency); // There's 16 bytes of padding at the end of the file header. static constexpr uint32_t Padding4B = 0; Writer.write(Padding4B); Writer.write(Padding4B); Writer.write(Padding4B); Writer.write(Padding4B); // Then write out the rest of the records, still in an endian-appropriate // format. for (const auto &R : Records) { switch (R.Type) { case RecordTypes::ENTER: case RecordTypes::ENTER_ARG: Writer.write(R.RecordType); Writer.write(static_cast(R.CPU)); Writer.write(uint8_t{0}); break; case RecordTypes::EXIT: Writer.write(R.RecordType); Writer.write(static_cast(R.CPU)); Writer.write(uint8_t{1}); break; case RecordTypes::TAIL_EXIT: Writer.write(R.RecordType); Writer.write(static_cast(R.CPU)); Writer.write(uint8_t{2}); break; case RecordTypes::CUSTOM_EVENT: case RecordTypes::TYPED_EVENT: // Skip custom and typed event records for v1 logs. 
continue; } Writer.write(R.FuncId); Writer.write(R.TSC); Writer.write(R.TId); if (FH.Version >= 3) Writer.write(R.PId); else Writer.write(Padding4B); Writer.write(Padding4B); Writer.write(Padding4B); } } namespace { // A structure that allows building a dictionary of stack ids for the Chrome // trace event format. struct StackIdData { // Each Stack of function calls has a unique ID. unsigned id; // Bookkeeping so that IDs can be maintained uniquely across threads. // Traversal keeps sibling pointers to other threads stacks. This is helpful // to determine when a thread encounters a new stack and should assign a new // unique ID. SmallVector *, 4> siblings; }; using StackTrieNode = TrieNode; // A helper function to find the sibling nodes for an encountered function in a // thread of execution. Relies on the invariant that each time a new node is // traversed in a thread, sibling bidirectional pointers are maintained. SmallVector findSiblings(StackTrieNode *parent, int32_t FnId, uint32_t TId, const DenseMap> &StackRootsByThreadId) { SmallVector Siblings{}; if (parent == nullptr) { for (auto map_iter : StackRootsByThreadId) { // Only look for siblings in other threads. if (map_iter.first != TId) for (auto node_iter : map_iter.second) { if (node_iter->FuncId == FnId) Siblings.push_back(node_iter); } } return Siblings; } for (auto *ParentSibling : parent->ExtraData.siblings) for (auto node_iter : ParentSibling->Callees) if (node_iter->FuncId == FnId) Siblings.push_back(node_iter); return Siblings; } // Given a function being invoked in a thread with id TId, finds and returns the // StackTrie representing the function call stack. If no node exists, creates // the node. Assigns unique IDs to stacks newly encountered among all threads // and keeps sibling links up to when creating new nodes. StackTrieNode *findOrCreateStackNode( StackTrieNode *Parent, int32_t FuncId, uint32_t TId, DenseMap> &StackRootsByThreadId, DenseMap &StacksByStackId, unsigned *id_counter, std::forward_list &NodeStore) { SmallVector &ParentCallees = Parent == nullptr ? StackRootsByThreadId[TId] : Parent->Callees; auto match = find_if(ParentCallees, [FuncId](StackTrieNode *ParentCallee) { return FuncId == ParentCallee->FuncId; }); if (match != ParentCallees.end()) return *match; SmallVector siblings = findSiblings(Parent, FuncId, TId, StackRootsByThreadId); if (siblings.empty()) { NodeStore.push_front({FuncId, Parent, {}, {(*id_counter)++, {}}}); StackTrieNode *CurrentStack = &NodeStore.front(); StacksByStackId[*id_counter - 1] = CurrentStack; ParentCallees.push_back(CurrentStack); return CurrentStack; } unsigned stack_id = siblings[0]->ExtraData.id; NodeStore.push_front({FuncId, Parent, {}, {stack_id, std::move(siblings)}}); StackTrieNode *CurrentStack = &NodeStore.front(); for (auto *sibling : CurrentStack->ExtraData.siblings) sibling->ExtraData.siblings.push_back(CurrentStack); ParentCallees.push_back(CurrentStack); return CurrentStack; } +void writeTraceViewerRecord(uint16_t Version, raw_ostream &OS, int32_t FuncId, + uint32_t TId, uint32_t PId, bool Symbolize, + const FuncIdConversionHelper &FuncIdHelper, + double EventTimestampUs, + const StackTrieNode &StackCursor, + StringRef FunctionPhenotype) { + OS << " "; + if (Version >= 3) { + OS << llvm::formatv( + R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "{3}", )" + R"("ts" : "{4:f4}", "sf" : "{5}" })", + (Symbolize ? 
FuncIdHelper.SymbolOrNumber(FuncId) + : llvm::to_string(FuncId)), + FunctionPhenotype, TId, PId, EventTimestampUs, + StackCursor.ExtraData.id); + } else { + OS << llvm::formatv( + R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "1", )" + R"("ts" : "{3:f3}", "sf" : "{4}" })", + (Symbolize ? FuncIdHelper.SymbolOrNumber(FuncId) + : llvm::to_string(FuncId)), + FunctionPhenotype, TId, EventTimestampUs, StackCursor.ExtraData.id); + } +} + } // namespace void TraceConverter::exportAsChromeTraceEventFormat(const Trace &Records, raw_ostream &OS) { const auto &FH = Records.getFileHeader(); auto Version = FH.Version; auto CycleFreq = FH.CycleFrequency; unsigned id_counter = 0; + OS << "{\n \"traceEvents\": ["; DenseMap StackCursorByThreadId{}; DenseMap> StackRootsByThreadId{}; DenseMap StacksByStackId{}; std::forward_list NodeStore{}; - - // Create a JSON Array which will hold all trace events. - json::Array TraceEvents; + int loop_count = 0; for (const auto &R : Records) { + if (loop_count++ == 0) + OS << "\n"; + else + OS << ",\n"; + // Chrome trace event format always wants data in micros. // CyclesPerMicro = CycleHertz / 10^6 // TSC / CyclesPerMicro == TSC * 10^6 / CycleHertz == MicroTimestamp // Could lose some precision here by converting the TSC to a double to // multiply by the period in micros. 52 bit mantissa is a good start though. // TODO: Make feature request to Chrome Trace viewer to accept ticks and a // frequency or do some more involved calculation to avoid dangers of // conversion. double EventTimestampUs = double(1000000) / CycleFreq * double(R.TSC); StackTrieNode *&StackCursor = StackCursorByThreadId[R.TId]; switch (R.Type) { case RecordTypes::CUSTOM_EVENT: case RecordTypes::TYPED_EVENT: // TODO: Support typed and custom event rendering on Chrome Trace Viewer. break; case RecordTypes::ENTER: case RecordTypes::ENTER_ARG: StackCursor = findOrCreateStackNode(StackCursor, R.FuncId, R.TId, StackRootsByThreadId, StacksByStackId, &id_counter, NodeStore); // Each record is represented as a json dictionary with function name, // type of B for begin or E for end, thread id, process id, // timestamp in microseconds, and a stack frame id. The ids are logged // in an id dictionary after the events. - TraceEvents.push_back(json::Object({ - {"name", Symbolize ? FuncIdHelper.SymbolOrNumber(R.FuncId) - : llvm::to_string(R.FuncId)}, - {"ph", "B"}, - {"tid", llvm::to_string(R.TId)}, - {"pid", llvm::to_string(Version >= 3 ? R.PId : 1)}, - {"ts", llvm::formatv("{0:f4}", EventTimestampUs)}, - {"sf", llvm::to_string(StackCursor->ExtraData.id)}, - })); + writeTraceViewerRecord(Version, OS, R.FuncId, R.TId, R.PId, Symbolize, + FuncIdHelper, EventTimestampUs, *StackCursor, "B"); break; case RecordTypes::EXIT: case RecordTypes::TAIL_EXIT: // No entries to record end for. if (StackCursor == nullptr) break; // Should we emit an END record anyway or account this condition? // (And/Or in loop termination below) StackTrieNode *PreviousCursor = nullptr; do { - TraceEvents.push_back(json::Object({ - {"name", Symbolize - ? FuncIdHelper.SymbolOrNumber(StackCursor->FuncId) - : llvm::to_string(StackCursor->FuncId)}, - {"ph", "E"}, - {"tid", llvm::to_string(R.TId)}, - {"pid", llvm::to_string(Version >= 3 ? 
R.PId : 1)}, - {"ts", llvm::formatv("{0:f4}", EventTimestampUs)}, - {"sf", llvm::to_string(StackCursor->ExtraData.id)}, - })); + if (PreviousCursor != nullptr) { + OS << ",\n"; + } + writeTraceViewerRecord(Version, OS, StackCursor->FuncId, R.TId, R.PId, + Symbolize, FuncIdHelper, EventTimestampUs, + *StackCursor, "E"); PreviousCursor = StackCursor; StackCursor = StackCursor->Parent; } while (PreviousCursor->FuncId != R.FuncId && StackCursor != nullptr); break; } } + OS << "\n ],\n"; // Close the Trace Events array. + OS << " " + << "\"displayTimeUnit\": \"ns\",\n"; // The stackFrames dictionary substantially reduces size of the output file by // avoiding repeating the entire call stack of function names for each entry. - json::Object StackFrames; - for (const auto &Stack : StacksByStackId) { - const auto &StackId = Stack.first; - const auto &StackFunctionNode = Stack.second; - json::Object::iterator It; - std::tie(It, std::ignore) = StackFrames.insert({ - llvm::to_string(StackId), - json::Object{ - {"name", - Symbolize ? FuncIdHelper.SymbolOrNumber(StackFunctionNode->FuncId) - : llvm::to_string(StackFunctionNode->FuncId)}}, - }); - - if (StackFunctionNode->Parent != nullptr) - It->second.getAsObject()->insert( - {"parent", llvm::to_string(StackFunctionNode->Parent->ExtraData.id)}); + OS << R"( "stackFrames": {)"; + int stack_frame_count = 0; + for (auto map_iter : StacksByStackId) { + if (stack_frame_count++ == 0) + OS << "\n"; + else + OS << ",\n"; + OS << " "; + OS << llvm::formatv( + R"("{0}" : { "name" : "{1}")", map_iter.first, + (Symbolize ? FuncIdHelper.SymbolOrNumber(map_iter.second->FuncId) + : llvm::to_string(map_iter.second->FuncId))); + if (map_iter.second->Parent != nullptr) + OS << llvm::formatv(R"(, "parent": "{0}")", + map_iter.second->Parent->ExtraData.id); + OS << " }"; } - - json::Object TraceJSON{ - {"displayTimeUnit", "ns"}, - {"traceEvents", std::move(TraceEvents)}, - {"stackFrames", std::move(StackFrames)}, - }; - - // Pretty-print the JSON using two spaces for indentations. - OS << formatv("{0:2}", json::Value(std::move(TraceJSON))); + OS << "\n }\n"; // Close the stack frames map. + OS << "}\n"; // Close the JSON entry. } namespace llvm { namespace xray { static CommandRegistration Unused(&Convert, []() -> Error { // FIXME: Support conversion to BINARY when upgrading XRay trace versions. InstrumentationMap Map; if (!ConvertInstrMap.empty()) { auto InstrumentationMapOrError = loadInstrumentationMap(ConvertInstrMap); if (!InstrumentationMapOrError) return joinErrors(make_error( Twine("Cannot open instrumentation map '") + ConvertInstrMap + "'", std::make_error_code(std::errc::invalid_argument)), InstrumentationMapOrError.takeError()); Map = std::move(*InstrumentationMapOrError); } const auto &FunctionAddresses = Map.getFunctionAddresses(); symbolize::LLVMSymbolizer::Options Opts( symbolize::FunctionNameKind::LinkageName, true, true, false, ""); symbolize::LLVMSymbolizer Symbolizer(Opts); llvm::xray::FuncIdConversionHelper FuncIdHelper(ConvertInstrMap, Symbolizer, FunctionAddresses); llvm::xray::TraceConverter TC(FuncIdHelper, ConvertSymbolize); std::error_code EC; raw_fd_ostream OS(ConvertOutput, EC, ConvertOutputFormat == ConvertFormats::BINARY ? 
sys::fs::OpenFlags::F_None : sys::fs::OpenFlags::F_Text); if (EC) return make_error( Twine("Cannot open file '") + ConvertOutput + "' for writing.", EC); auto TraceOrErr = loadTraceFile(ConvertInput, ConvertSortInput); if (!TraceOrErr) return joinErrors( make_error( Twine("Failed loading input file '") + ConvertInput + "'.", std::make_error_code(std::errc::executable_format_error)), TraceOrErr.takeError()); auto &T = *TraceOrErr; switch (ConvertOutputFormat) { case ConvertFormats::YAML: TC.exportAsYAML(T, OS); break; case ConvertFormats::BINARY: TC.exportAsRAWv1(T, OS); break; case ConvertFormats::CHROME_TRACE_EVENT: TC.exportAsChromeTraceEventFormat(T, OS); break; } return Error::success(); }); } // namespace xray } // namespace llvm
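
For reference, the Chrome Trace Event output in xray-converter.cpp is now assembled by hand with llvm::formatv and raw_ostream rather than through llvm/Support/JSON.h. The following is a minimal standalone sketch, not part of the patch, showing what one emitted record looks like; it reuses the v3 format string from writeTraceViewerRecord, and the function name, thread/process ids, timestamp, and stack id are illustrative placeholders rather than values from a real trace.

#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Same layout as the Version >= 3 branch of writeTraceViewerRecord in the
  // patch; every argument here is a made-up placeholder for illustration.
  llvm::outs() << llvm::formatv(
                      R"({ "name" : "{0}", "ph" : "{1}", "tid" : "{2}", "pid" : "{3}", )"
                      R"("ts" : "{4:f4}", "sf" : "{5}" })",
                      "example_fn", "B", 42, 1, 12.3456, 7)
               << "\n";
  // Prints:
  // { "name" : "example_fn", "ph" : "B", "tid" : "42", "pid" : "1", "ts" : "12.3456", "sf" : "7" }
  return 0;
}

exportAsChromeTraceEventFormat then wraps such records in a top-level object roughly of the form { "traceEvents": [ ... ], "displayTimeUnit": "ns", "stackFrames": { "<id>": { "name": ..., "parent": ... } } }, matching the literals it writes directly to OS; the "parent" key appears only for non-root stack frames.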