diff --git a/include/Makefile.am b/include/Makefile.am index 19726bba1864..1e5c71150eeb 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -1,200 +1,201 @@ if BUILD_LINUX include $(srcdir)/%D%/os/linux/Makefile.am endif if BUILD_FREEBSD include $(srcdir)/%D%/os/freebsd/Makefile.am endif COMMON_H = \ cityhash.h \ zfeature_common.h \ zfs_comutil.h \ zfs_deleg.h \ zfs_fletcher.h \ zfs_namecheck.h \ zfs_prop.h \ \ sys/abd.h \ sys/abd_impl.h \ sys/aggsum.h \ sys/arc.h \ sys/arc_impl.h \ + sys/asm_linkage.h \ sys/avl.h \ sys/avl_impl.h \ sys/bitops.h \ sys/blake3.h \ sys/blkptr.h \ sys/bplist.h \ sys/bpobj.h \ sys/bptree.h \ sys/bqueue.h \ sys/btree.h \ sys/dataset_kstats.h \ sys/dbuf.h \ sys/ddt.h \ sys/dmu.h \ sys/dmu_impl.h \ sys/dmu_objset.h \ sys/dmu_recv.h \ sys/dmu_redact.h \ sys/dmu_send.h \ sys/dmu_traverse.h \ sys/dmu_tx.h \ sys/dmu_zfetch.h \ sys/dnode.h \ sys/dsl_bookmark.h \ sys/dsl_crypt.h \ sys/dsl_dataset.h \ sys/dsl_deadlist.h \ sys/dsl_deleg.h \ sys/dsl_destroy.h \ sys/dsl_dir.h \ sys/dsl_pool.h \ sys/dsl_prop.h \ sys/dsl_scan.h \ sys/dsl_synctask.h \ sys/dsl_userhold.h \ sys/edonr.h \ sys/efi_partition.h \ sys/frame.h \ sys/hkdf.h \ sys/metaslab.h \ sys/metaslab_impl.h \ sys/mmp.h \ sys/mntent.h \ sys/mod.h \ sys/multilist.h \ sys/nvpair.h \ sys/nvpair_impl.h \ sys/objlist.h \ sys/pathname.h \ sys/qat.h \ sys/range_tree.h \ sys/rrwlock.h \ sys/sa.h \ sys/sa_impl.h \ sys/skein.h \ sys/spa.h \ sys/spa_checkpoint.h \ sys/spa_checksum.h \ sys/spa_impl.h \ sys/spa_log_spacemap.h \ sys/space_map.h \ sys/space_reftree.h \ sys/sysevent.h \ sys/txg.h \ sys/txg_impl.h \ sys/u8_textprep.h \ sys/u8_textprep_data.h \ sys/uberblock.h \ sys/uberblock_impl.h \ sys/uio_impl.h \ sys/unique.h \ sys/uuid.h \ sys/vdev.h \ sys/vdev_disk.h \ sys/vdev_draid.h \ sys/vdev_file.h \ sys/vdev_impl.h \ sys/vdev_indirect_births.h \ sys/vdev_indirect_mapping.h \ sys/vdev_initialize.h \ sys/vdev_raidz.h \ sys/vdev_raidz_impl.h \ sys/vdev_rebuild.h \ sys/vdev_removal.h \ sys/vdev_trim.h \ sys/xvattr.h \ sys/zap.h \ sys/zap_impl.h \ sys/zap_leaf.h \ sys/zcp.h \ sys/zcp_global.h \ sys/zcp_iter.h \ sys/zcp_prop.h \ sys/zcp_set.h \ sys/zfeature.h \ sys/zfs_acl.h \ sys/zfs_bootenv.h \ sys/zfs_chksum.h \ sys/zfs_context.h \ sys/zfs_debug.h \ sys/zfs_delay.h \ sys/zfs_file.h \ sys/zfs_fuid.h \ sys/zfs_project.h \ sys/zfs_quota.h \ sys/zfs_racct.h \ sys/zfs_ratelimit.h \ sys/zfs_refcount.h \ sys/zfs_rlock.h \ sys/zfs_sa.h \ sys/zfs_stat.h \ sys/zfs_sysfs.h \ sys/zfs_vfsops.h \ sys/zfs_vnops.h \ sys/zfs_znode.h \ sys/zil.h \ sys/zil_impl.h \ sys/zio.h \ sys/zio_checksum.h \ sys/zio_compress.h \ sys/zio_crypt.h \ sys/zio_impl.h \ sys/zio_priority.h \ sys/zrlock.h \ sys/zthr.h \ \ sys/crypto/api.h \ sys/crypto/common.h \ sys/crypto/icp.h \ \ sys/fm/protocol.h \ sys/fm/util.h \ sys/fm/fs/zfs.h \ \ sys/fs/zfs.h \ \ sys/lua/lauxlib.h \ sys/lua/lua.h \ sys/lua/luaconf.h \ sys/lua/lualib.h \ \ sys/sysevent/dev.h \ sys/sysevent/eventdefs.h \ \ sys/zstd/zstd.h KERNEL_H = \ sys/zfs_ioctl.h \ sys/zfs_ioctl_impl.h \ sys/zfs_onexit.h \ sys/zvol.h \ sys/zvol_impl.h USER_H = \ libnvpair.h \ libuutil.h \ libuutil_common.h \ libuutil_impl.h \ libzfs.h \ libzfs_core.h \ libzfsbootenv.h \ libzutil.h \ thread_pool.h if CONFIG_USER libzfsdir = $(includedir)/libzfs nobase_libzfs_HEADERS = $(COMMON_H) $(USER_H) endif kerneldir = $(prefix)/src/zfs-$(VERSION)/include if CONFIG_KERNEL if BUILD_LINUX nobase_kernel_HEADERS = $(COMMON_H) $(KERNEL_H) endif endif diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am 
index a750f52e7d25..89d4ef564d5f 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -1,90 +1,93 @@ noinst_HEADERS = \ %D%/linux/compiler.h \ %D%/linux/types.h \ \ %D%/spl/acl/acl_common.h \ \ %D%/spl/rpc/xdr.h \ \ + %D%/spl/sys/ia32/asm_linkage.h \ + \ %D%/spl/sys/acl.h \ %D%/spl/sys/acl_impl.h \ %D%/spl/sys/atomic.h \ %D%/spl/sys/byteorder.h \ %D%/spl/sys/callb.h \ %D%/spl/sys/ccompat.h \ %D%/spl/sys/ccompile.h \ %D%/spl/sys/cmn_err.h \ %D%/spl/sys/condvar.h \ %D%/spl/sys/cred.h \ %D%/spl/sys/ctype.h \ %D%/spl/sys/debug.h \ %D%/spl/sys/dirent.h \ %D%/spl/sys/disp.h \ %D%/spl/sys/dkio.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ %D%/spl/sys/freebsd_rwlock.h \ %D%/spl/sys/idmap.h \ %D%/spl/sys/inttypes.h \ %D%/spl/sys/isa_defs.h \ %D%/spl/sys/kmem.h \ %D%/spl/sys/kmem_cache.h \ %D%/spl/sys/kstat.h \ %D%/spl/sys/list.h \ %D%/spl/sys/list_impl.h \ %D%/spl/sys/lock.h \ %D%/spl/sys/misc.h \ %D%/spl/sys/mod_os.h \ %D%/spl/sys/mode.h \ %D%/spl/sys/mount.h \ %D%/spl/sys/mutex.h \ %D%/spl/sys/param.h \ %D%/spl/sys/policy.h \ %D%/spl/sys/proc.h \ %D%/spl/sys/processor.h \ %D%/spl/sys/procfs_list.h \ %D%/spl/sys/random.h \ %D%/spl/sys/rwlock.h \ %D%/spl/sys/sdt.h \ %D%/spl/sys/sid.h \ %D%/spl/sys/sig.h \ %D%/spl/sys/simd.h \ %D%/spl/sys/simd_powerpc.h \ %D%/spl/sys/simd_x86.h \ %D%/spl/sys/spl_condvar.h \ %D%/spl/sys/string.h \ %D%/spl/sys/sunddi.h \ %D%/spl/sys/sysmacros.h \ %D%/spl/sys/systeminfo.h \ %D%/spl/sys/systm.h \ %D%/spl/sys/taskq.h \ %D%/spl/sys/thread.h \ %D%/spl/sys/time.h \ %D%/spl/sys/timer.h \ %D%/spl/sys/trace.h \ %D%/spl/sys/trace_zfs.h \ %D%/spl/sys/types.h \ %D%/spl/sys/types32.h \ %D%/spl/sys/uio.h \ %D%/spl/sys/uuid.h \ %D%/spl/sys/vfs.h \ %D%/spl/sys/vm.h \ %D%/spl/sys/vmsystm.h \ %D%/spl/sys/vnode.h \ %D%/spl/sys/vnode_impl.h \ %D%/spl/sys/wmsum.h \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ %D%/zfs/sys/freebsd_crypto.h \ %D%/zfs/sys/sha2.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ %D%/zfs/sys/zfs_ctldir.h \ %D%/zfs/sys/zfs_dir.h \ %D%/zfs/sys/zfs_ioctl_compat.h \ %D%/zfs/sys/zfs_vfsops_os.h \ %D%/zfs/sys/zfs_vnops_os.h \ %D%/zfs/sys/zfs_znode_impl.h \ %D%/zfs/sys/zpl.h + diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/include/os/freebsd/spl/sys/ia32/asm_linkage.h similarity index 84% copy from module/icp/include/sys/ia32/asm_linkage.h copy to include/os/freebsd/spl/sys/ia32/asm_linkage.h index e3e769ffd858..bbbd22030213 100644 --- a/module/icp/include/sys/ia32/asm_linkage.h +++ b/include/os/freebsd/spl/sys/ia32/asm_linkage.h @@ -1,194 +1,178 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#include -#include - -#if defined(_KERNEL) && defined(__linux__) -#include -#endif - -#ifndef ENDBR -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -/* CSTYLED */ -#if __has_include() - -#include - -#ifdef _CET_ENDBR -#define ENDBR _CET_ENDBR -#endif /* _CET_ENDBR */ +#define RET ret -#endif /* */ -#endif /* __ELF__ && __CET__ && __has_include */ -#endif /* !ENDBR */ +/* Tell compiler to call assembler like Unix */ +#undef ASMABI +#define ASMABI __attribute__((sysv_abi)) -#ifndef ENDBR #define ENDBR -#endif -#ifndef RET -#define RET ret -#endif + +#define SECTION_TEXT .text +#define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ + /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ -#undef ENTRY #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ - .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ - .type x, @function; \ +x: + +#define ENTRY_ALIGN(x, a) \ + .text; \ + .align a; \ + .globl x; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ - .type x, @function; \ - .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ - .type x, @function; \ - .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. 
*/ -#define SET_SIZE(x) \ - .size x, [.-x] +#define SET_SIZE(x) + +#define SET_OBJ(x) + #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 13ba8060c62d..e20702d332ac 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -1,112 +1,116 @@ if CONFIG_KERNEL kernel_linuxdir = $(kerneldir)/linux kernel_linux_HEADERS = \ %D%/kernel/linux/blkdev_compat.h \ %D%/kernel/linux/compiler_compat.h \ %D%/kernel/linux/dcache_compat.h \ %D%/kernel/linux/kmap_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ %D%/kernel/linux/percpu_compat.h \ %D%/kernel/linux/simd.h \ %D%/kernel/linux/simd_aarch64.h \ %D%/kernel/linux/simd_powerpc.h \ %D%/kernel/linux/simd_x86.h \ %D%/kernel/linux/utsname_compat.h \ %D%/kernel/linux/vfs_compat.h \ %D%/kernel/linux/xattr_compat.h kernel_sysdir = $(kerneldir)/sys kernel_sys_HEADERS = \ %D%/zfs/sys/policy.h \ %D%/zfs/sys/sha2.h \ %D%/zfs/sys/trace_acl.h \ %D%/zfs/sys/trace_arc.h \ %D%/zfs/sys/trace_common.h \ %D%/zfs/sys/trace_dbgmsg.h \ %D%/zfs/sys/trace_dbuf.h \ %D%/zfs/sys/trace_dmu.h \ %D%/zfs/sys/trace_dnode.h \ %D%/zfs/sys/trace_multilist.h \ %D%/zfs/sys/trace_rrwlock.h \ %D%/zfs/sys/trace_txg.h \ %D%/zfs/sys/trace_vdev.h \ %D%/zfs/sys/trace_zfs.h \ %D%/zfs/sys/trace_zil.h \ %D%/zfs/sys/trace_zio.h \ %D%/zfs/sys/trace_zrlock.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ %D%/zfs/sys/zfs_ctldir.h \ %D%/zfs/sys/zfs_dir.h \ %D%/zfs/sys/zfs_vfsops_os.h \ %D%/zfs/sys/zfs_vnops_os.h \ %D%/zfs/sys/zfs_znode_impl.h \ %D%/zfs/sys/zpl.h kernel_spl_rpcdir = $(kerneldir)/spl/rpc kernel_spl_rpc_HEADERS = \ %D%/spl/rpc/xdr.h kernel_spl_sysdir = $(kerneldir)/spl/sys kernel_spl_sys_HEADERS = \ %D%/spl/sys/acl.h \ %D%/spl/sys/atomic.h \ %D%/spl/sys/byteorder.h \ %D%/spl/sys/callb.h \ %D%/spl/sys/callo.h \ %D%/spl/sys/cmn_err.h \ %D%/spl/sys/condvar.h \ %D%/spl/sys/cred.h \ %D%/spl/sys/ctype.h \ %D%/spl/sys/debug.h \ %D%/spl/sys/disp.h \ %D%/spl/sys/dkio.h \ %D%/spl/sys/errno.h \ %D%/spl/sys/fcntl.h \ %D%/spl/sys/file.h \ %D%/spl/sys/inttypes.h \ %D%/spl/sys/isa_defs.h \ %D%/spl/sys/kmem.h \ %D%/spl/sys/kmem_cache.h \ %D%/spl/sys/kstat.h \ %D%/spl/sys/list.h \ %D%/spl/sys/misc.h \ %D%/spl/sys/mod_os.h \ %D%/spl/sys/mutex.h \ %D%/spl/sys/param.h \ %D%/spl/sys/proc.h \ %D%/spl/sys/processor.h \ %D%/spl/sys/procfs_list.h \ %D%/spl/sys/random.h \ %D%/spl/sys/rwlock.h \ %D%/spl/sys/shrinker.h \ %D%/spl/sys/sid.h \ %D%/spl/sys/signal.h \ %D%/spl/sys/simd.h \ %D%/spl/sys/stat.h \ %D%/spl/sys/string.h \ %D%/spl/sys/sunddi.h \ %D%/spl/sys/sysmacros.h \ %D%/spl/sys/systeminfo.h \ %D%/spl/sys/taskq.h \ %D%/spl/sys/thread.h \ %D%/spl/sys/time.h \ %D%/spl/sys/timer.h \ %D%/spl/sys/trace.h \ %D%/spl/sys/trace_spl.h \ %D%/spl/sys/trace_taskq.h \ %D%/spl/sys/tsd.h \ %D%/spl/sys/types.h \ %D%/spl/sys/types32.h \ %D%/spl/sys/uio.h \ %D%/spl/sys/user.h \ %D%/spl/sys/vfs.h \ %D%/spl/sys/vmem.h \ %D%/spl/sys/vmsystm.h \ %D%/spl/sys/vnode.h \ %D%/spl/sys/wait.h \ %D%/spl/sys/wmsum.h \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h + +kernel_spl_ia32dir = $(kernel_spl_sysdir)/ia32 +kernel_spl_ia32_HEADERS = \ + %D%/spl/sys/ia32/asm_linkage.h endif diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/include/os/linux/spl/sys/ia32/asm_linkage.h similarity index 92% copy from module/icp/include/sys/ia32/asm_linkage.h copy to include/os/linux/spl/sys/ia32/asm_linkage.h index e3e769ffd858..2864d9455129 100644 --- 
a/module/icp/include/sys/ia32/asm_linkage.h +++ b/include/os/linux/spl/sys/ia32/asm_linkage.h @@ -1,194 +1,212 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#include -#include - #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif +/* You can set to nothing on Unix platforms */ +#undef ASMABI +#define ASMABI __attribute__((sysv_abi)) + +#define SECTION_TEXT .text +#define SECTION_STATIC .section .rodata + #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. 
ENTRY_NP is identical, but * never calls mcount. */ #undef ENTRY #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: +#define ENTRY_ALIGN(x, a) \ + .text; \ + .align a; \ + .globl x; \ + .type x, @function; \ +x: + +#define FUNCTION(x) \ + .type x, @function; \ +x: + /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] +#define SET_OBJ(x) .type x, @object + + #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/include/sys/asm_linkage.h b/include/sys/asm_linkage.h similarity index 98% copy from module/icp/include/sys/asm_linkage.h copy to include/sys/asm_linkage.h index 84aa0854a9ff..749157d4c3db 100644 --- a/module/icp/include/sys/asm_linkage.h +++ b/include/sys/asm_linkage.h @@ -1,46 +1,48 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ASM_LINKAGE_H #define _SYS_ASM_LINKAGE_H +#define ASMABI + #if defined(__i386) || defined(__amd64) #include /* XX64 x86/sys/asm_linkage.h */ #endif #if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL) #include #else /* userspace */ #define FRAME_BEGIN #define FRAME_END #endif #endif /* _SYS_ASM_LINKAGE_H */ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index b7f1d0e1b1e4..7c6cf71de242 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -1,69 +1,70 @@ libicp_la_CCASFLAGS = $(AM_CCASFLAGS) libicp_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS) noinst_LTLIBRARIES += libicp.la nodist_libicp_la_SOURCES = \ module/icp/spi/kcf_spi.c \ module/icp/api/kcf_ctxops.c \ module/icp/api/kcf_cipher.c \ module/icp/api/kcf_mac.c \ module/icp/algs/aes/aes_impl_aesni.c \ module/icp/algs/aes/aes_impl_generic.c \ module/icp/algs/aes/aes_impl_x86-64.c \ module/icp/algs/aes/aes_impl.c \ module/icp/algs/aes/aes_modes.c \ module/icp/algs/blake3/blake3.c \ module/icp/algs/blake3/blake3_generic.c \ module/icp/algs/blake3/blake3_impl.c \ module/icp/algs/blake3/blake3_x86-64.c \ module/icp/algs/edonr/edonr.c \ module/icp/algs/modes/modes.c \ module/icp/algs/modes/cbc.c \ module/icp/algs/modes/gcm_generic.c \ module/icp/algs/modes/gcm_pclmulqdq.c \ module/icp/algs/modes/gcm.c \ module/icp/algs/modes/ctr.c \ module/icp/algs/modes/ccm.c \ module/icp/algs/modes/ecb.c \ module/icp/algs/sha2/sha2.c \ module/icp/algs/skein/skein.c \ module/icp/algs/skein/skein_block.c \ module/icp/algs/skein/skein_iv.c \ module/icp/illumos-crypto.c \ module/icp/io/aes.c \ module/icp/io/sha2_mod.c \ module/icp/io/skein_mod.c \ module/icp/core/kcf_sched.c \ module/icp/core/kcf_prov_lib.c \ module/icp/core/kcf_callprov.c \ module/icp/core/kcf_mech_tabs.c \ module/icp/core/kcf_prov_tabs.c if TARGET_CPU_AARCH64 nodist_libicp_la_SOURCES += \ module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \ module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S endif if TARGET_CPU_POWERPC nodist_libicp_la_SOURCES += \ module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \ module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S endif if TARGET_CPU_X86_64 nodist_libicp_la_SOURCES += \ module/icp/asm-x86_64/aes/aeskey.c \ module/icp/asm-x86_64/aes/aes_amd64.S \ module/icp/asm-x86_64/aes/aes_aesni.S \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ module/icp/asm-x86_64/sha2/sha256_impl.S \ module/icp/asm-x86_64/sha2/sha512_impl.S \ module/icp/asm-x86_64/blake3/blake3_avx2.S \ module/icp/asm-x86_64/blake3/blake3_avx512.S \ module/icp/asm-x86_64/blake3/blake3_sse2.S \ module/icp/asm-x86_64/blake3/blake3_sse41.S endif + diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am index 6f0e1818d22e..c8b41bbc296e 100644 --- a/lib/libspl/include/Makefile.am +++ b/lib/libspl/include/Makefile.am @@ -1,94 +1,104 @@ libspldir = $(includedir)/libspl libspl_HEADERS = \ %D%/assert.h \ %D%/atomic.h \ %D%/libgen.h \ %D%/libshare.h \ %D%/statcommon.h \ %D%/stdlib.h \ %D%/string.h \ %D%/umem.h \ %D%/unistd.h \ %D%/zone.h if BUILD_FREEBSD libspl_HEADERS += \ %D%/os/freebsd/fcntl.h endif libspl_rpcdir = $(libspldir)/rpc libspl_rpc_HEADERS = \ %D%/rpc/xdr.h libspl_sysdir = $(libspldir)/sys libspl_sys_HEADERS = \ %D%/sys/acl.h \ %D%/sys/acl_impl.h \ + %D%/sys/asm_linkage.h \ %D%/sys/callb.h \ %D%/sys/cmn_err.h \ %D%/sys/cred.h \ %D%/sys/debug.h \ %D%/sys/dkio.h \ %D%/sys/dklabel.h \ %D%/sys/feature_tests.h \ %D%/sys/inttypes.h \ 
%D%/sys/isa_defs.h \ %D%/sys/kmem.h \ %D%/sys/kstat.h \ %D%/sys/list.h \ %D%/sys/list_impl.h \ %D%/sys/mhd.h \ %D%/sys/mkdev.h \ %D%/sys/policy.h \ %D%/sys/poll.h \ %D%/sys/priv.h \ %D%/sys/processor.h \ %D%/sys/sha2.h \ %D%/sys/simd.h \ %D%/sys/stack.h \ %D%/sys/stdtypes.h \ %D%/sys/string.h \ %D%/sys/sunddi.h \ %D%/sys/systeminfo.h \ %D%/sys/time.h \ %D%/sys/trace_spl.h \ %D%/sys/trace_zfs.h \ %D%/sys/types.h \ %D%/sys/types32.h \ %D%/sys/uio.h \ %D%/sys/vnode.h \ %D%/sys/wmsum.h \ %D%/sys/zone.h +libspl_ia32dir = $(libspldir)/sys/ia32 + if BUILD_LINUX libspl_sys_HEADERS += \ %D%/os/linux/sys/byteorder.h \ %D%/os/linux/sys/errno.h \ %D%/os/linux/sys/mnttab.h \ %D%/os/linux/sys/mount.h \ %D%/os/linux/sys/param.h \ %D%/os/linux/sys/stat.h \ %D%/os/linux/sys/sysmacros.h \ %D%/os/linux/sys/zfs_context_os.h + +libspl_ia32_HEADERS = \ + %D%/os/linux/sys/ia32/asm_linkage.h endif if BUILD_FREEBSD libspl_sys_HEADERS += \ %D%/os/freebsd/sys/byteorder.h \ %D%/os/freebsd/sys/fcntl.h \ %D%/os/freebsd/sys/file.h \ %D%/os/freebsd/sys/mnttab.h \ %D%/os/freebsd/sys/mount.h \ %D%/os/freebsd/sys/param.h \ %D%/os/freebsd/sys/stat.h \ %D%/os/freebsd/sys/sysmacros.h \ %D%/os/freebsd/sys/vfs.h \ %D%/os/freebsd/sys/zfs_context_os.h + +libspl_ia32_HEADERS = \ + %D%/os/freebsd/sys/ia32/asm_linkage.h endif libspl_sys_dktpdir = $(libspl_sysdir)/dktp libspl_sys_dktp_HEADERS = \ %D%/sys/dktp/fdisk.h + diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h similarity index 84% copy from module/icp/include/sys/ia32/asm_linkage.h copy to lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h index e3e769ffd858..3b4beecc5d34 100644 --- a/module/icp/include/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/freebsd/sys/ia32/asm_linkage.h @@ -1,194 +1,184 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#include -#include - -#if defined(_KERNEL) && defined(__linux__) -#include +#if defined(__linux__) && defined(CONFIG_SLS) +#define RET ret; int3 +#else +#define RET ret #endif -#ifndef ENDBR -#if defined(__ELF__) && defined(__CET__) && defined(__has_include) -/* CSTYLED */ -#if __has_include() - -#include - -#ifdef _CET_ENDBR -#define ENDBR _CET_ENDBR -#endif /* _CET_ENDBR */ +/* Tell compiler to call assembler like Unix */ +#undef ASMABI +#define ASMABI __attribute__((sysv_abi)) -#endif /* */ -#endif /* __ELF__ && __CET__ && __has_include */ -#endif /* !ENDBR */ - -#ifndef ENDBR #define ENDBR -#endif -#ifndef RET -#define RET ret -#endif + +#define SECTION_TEXT .text +#define SECTION_STATIC .data #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. */ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ -#undef ENTRY #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ - .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ - .type x, @function; \ +x: + +#define ENTRY_ALIGN(x, a) \ + .text; \ + .align a; \ + .globl x; \ +x: + +#define FUNCTION(x) \ + .type x, @function; \ x: /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ - .type x, @function; \ - .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ - .type x, @function; \ - .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. 
*/ -#define SET_SIZE(x) \ - .size x, [.-x] +#define SET_SIZE(x) + +#define SET_OBJ(x) #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h similarity index 92% rename from module/icp/include/sys/ia32/asm_linkage.h rename to lib/libspl/include/os/linux/sys/ia32/asm_linkage.h index e3e769ffd858..76765dd040cf 100644 --- a/module/icp/include/sys/ia32/asm_linkage.h +++ b/lib/libspl/include/os/linux/sys/ia32/asm_linkage.h @@ -1,194 +1,211 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _IA32_SYS_ASM_LINKAGE_H #define _IA32_SYS_ASM_LINKAGE_H -#include -#include - #if defined(_KERNEL) && defined(__linux__) #include #endif #ifndef ENDBR #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* CSTYLED */ #if __has_include() #include #ifdef _CET_ENDBR #define ENDBR _CET_ENDBR #endif /* _CET_ENDBR */ #endif /* */ #endif /* __ELF__ && __CET__ && __has_include */ #endif /* !ENDBR */ #ifndef ENDBR #define ENDBR #endif #ifndef RET #define RET ret #endif +/* You can set to nothing on Unix platforms */ +#undef ASMABI +#define ASMABI __attribute__((sysv_abi)) + +#define SECTION_TEXT .text +#define SECTION_STATIC .section .rodata + #ifdef __cplusplus extern "C" { #endif #ifdef _ASM /* The remainder of this file is only for assembly files */ /* * make annoying differences in assembler syntax go away */ /* * D16 and A16 are used to insert instructions prefixes; the * macros help the assembler code be slightly more portable. */ #if !defined(__GNUC_AS__) /* * /usr/ccs/bin/as prefixes are parsed as separate instructions */ #define D16 data16; #define A16 addr16; /* * (There are some weird constructs in constant expressions) */ #define _CONST(const) [const] #define _BITNOT(const) -1!_CONST(const) #define _MUL(a, b) _CONST(a \* b) #else /* * Why not use the 'data16' and 'addr16' prefixes .. well, the * assembler doesn't quite believe in real mode, and thus argues with * us about what we're trying to do. */ #define D16 .byte 0x66; #define A16 .byte 0x67; #define _CONST(const) (const) #define _BITNOT(const) ~_CONST(const) #define _MUL(a, b) _CONST(a * b) #endif /* * C pointers are different sizes between i386 and amd64. * These constants can be used to compute offsets into pointer arrays. */ #if defined(__amd64) #define CLONGSHIFT 3 #define CLONGSIZE 8 #define CLONGMASK 7 #elif defined(__i386) #define CLONGSHIFT 2 #define CLONGSIZE 4 #define CLONGMASK 3 #endif /* * Since we know we're either ILP32 or LP64 .. 
*/ #define CPTRSHIFT CLONGSHIFT #define CPTRSIZE CLONGSIZE #define CPTRMASK CLONGMASK #if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) #error "inconsistent shift constants" #endif #if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) #error "inconsistent mask constants" #endif #define ASM_ENTRY_ALIGN 16 /* * SSE register alignment and save areas */ #define XMM_SIZE 16 #define XMM_ALIGN 16 /* * ENTRY provides the standard procedure entry code and an easy way to * insert the calls to mcount for profiling. ENTRY_NP is identical, but * never calls mcount. */ #undef ENTRY #define ENTRY(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: MCOUNT(x) #define ENTRY_NP(x) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x; \ .type x, @function; \ x: +#define ENTRY_ALIGN(x, a) \ + .text; \ + .align a; \ + .globl x; \ + .type x, @function; \ +x: + +#define FUNCTION(x) \ + .type x, @function; \ +x: + /* * ENTRY2 is identical to ENTRY but provides two labels for the entry point. */ #define ENTRY2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: MCOUNT(x) #define ENTRY_NP2(x, y) \ .text; \ .align ASM_ENTRY_ALIGN; \ .globl x, y; \ .type x, @function; \ .type y, @function; \ x:; \ y: /* * SET_SIZE trails a function and set the size for the ELF symbol table. */ #define SET_SIZE(x) \ .size x, [.-x] +#define SET_OBJ(x) .type x, @object + #endif /* _ASM */ #ifdef __cplusplus } #endif #endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/include/sys/asm_linkage.h b/lib/libspl/include/sys/asm_linkage.h similarity index 100% rename from module/icp/include/sys/asm_linkage.h rename to lib/libspl/include/sys/asm_linkage.h diff --git a/module/Kbuild.in b/module/Kbuild.in index a39f9d9d0500..a1ea08cd4348 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -1,475 +1,475 @@ # When integrated in to a monolithic kernel the spl module must appear # first. This ensures its module initialization function is run before # any of the other module initialization functions which depend on it. ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include icp_include = @abs_srcdir@/icp/include zstd_include = @abs_srcdir@/zstd/include ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs icp_include = $(srctree)/$(src)/icp/include zstd_include = $(srctree)/$(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build. 
ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ endif asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ifeq ($(CONFIG_ARM64),y) CFLAGS_REMOVE_zcommon/zfs_fletcher_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only endif # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value obj-$(CONFIG_ZFS) := spl.o zfs.o SPL_OBJS := \ spl-atomic.o \ spl-condvar.o \ spl-cred.o \ spl-err.o \ spl-generic.o \ spl-kmem-cache.o \ spl-kmem.o \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ spl-tsd.o \ spl-vmem.o \ spl-xdr.o \ spl-zlib.o \ spl-zone.o spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS)) zfs-objs += avl/avl.o ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ algs/modes/ctr.o \ algs/modes/ecb.o \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ algs/sha2/sha2.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ api/kcf_cipher.o \ api/kcf_ctxops.o \ api/kcf_mac.o \ core/kcf_callprov.o \ core/kcf_mech_tabs.o \ core/kcf_prov_lib.o \ core/kcf_prov_tabs.o \ core/kcf_sched.o \ illumos-crypto.o \ io/aes.o \ io/sha2_mod.o \ io/skein_mod.o \ spi/kcf_spi.o ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ asm-x86_64/blake3/blake3_avx2.o \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o \ asm-x86_64/sha2/sha256_impl.o \ asm-x86_64/sha2/sha512_impl.o ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ asm-aarch64/blake3/b3_aarch64_sse41.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ asm-ppc64/blake3/b3_ppc64le_sse41.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ - $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ - $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) # Suppress objtool "return with modified stack frame" warnings. 
OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y # Suppress objtool "unsupported stack pointer realignment" warnings. We are # not using a DRAP register while aligning the stack to a 64 byte boundary. # See #6950 for the reasoning. OBJECT_FILES_NON_STANDARD_sha256_impl.o := y OBJECT_FILES_NON_STANDARD_sha512_impl.o := y LUA_OBJS := \ lapi.o \ lauxlib.o \ lbaselib.o \ lcode.o \ lcompat.o \ lcorolib.o \ lctype.o \ ldebug.o \ ldo.o \ lfunc.o \ lgc.o \ llex.o \ lmem.o \ lobject.o \ lopcodes.o \ lparser.o \ lstate.o \ lstring.o \ lstrlib.o \ ltable.o \ ltablib.o \ ltm.o \ lvm.o \ lzio.o \ setjmp/setjmp.o zfs-objs += $(addprefix lua/,$(LUA_OBJS)) NVPAIR_OBJS := \ fnvpair.o \ nvpair.o \ nvpair_alloc_fixed.o \ nvpair_alloc_spl.o zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS)) UNICODE_OBJS := \ u8_textprep.o \ uconv.o zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) ZCOMMON_OBJS := \ cityhash.o \ zfeature_common.o \ zfs_comutil.o \ zfs_deleg.o \ zfs_fletcher.o \ zfs_fletcher_superscalar.o \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ zpool_prop.o \ zprop_common.o ZCOMMON_OBJS_X86 := \ zfs_fletcher_avx512.o \ zfs_fletcher_intel.o \ zfs_fletcher_sse.o ZCOMMON_OBJS_ARM64 := \ zfs_fletcher_aarch64_neon.o zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS)) zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64)) # Zstd uses -O3 by default, so we should follow ZFS_ZSTD_FLAGS := -O3 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h # Set it for other compilers, too. ZFS_ZSTD_FLAGS += -fno-tree-vectorize # SSE register return with SSE disabled if -march=znverX is passed ZFS_ZSTD_FLAGS += -U__BMI__ # Quiet warnings about frame size due to unused code in unmodified zstd lib ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480 ZSTD_OBJS := \ zfs_zstd.o \ zstd_sparc.o ZSTD_UPSTREAM_OBJS := \ lib/common/entropy_common.o \ lib/common/error_private.o \ lib/common/fse_decompress.o \ lib/common/pool.o \ lib/common/zstd_common.o \ lib/compress/fse_compress.o \ lib/compress/hist.o \ lib/compress/huf_compress.o \ lib/compress/zstd_compress.o \ lib/compress/zstd_compress_literals.o \ lib/compress/zstd_compress_sequences.o \ lib/compress/zstd_compress_superblock.o \ lib/compress/zstd_double_fast.o \ lib/compress/zstd_fast.o \ lib/compress/zstd_lazy.o \ lib/compress/zstd_ldm.o \ lib/compress/zstd_opt.o \ lib/decompress/huf_decompress.o \ lib/decompress/zstd_ddict.o \ lib/decompress/zstd_decompress.o \ lib/decompress/zstd_decompress_block.o zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) # Disable aarch64 neon SIMD instructions for kernel mode $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) $(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ bptree.o \ bqueue.o \ btree.o \ dataset_kstats.o \ dbuf.o \ dbuf_stats.o \ ddt.o \ ddt_zap.o \ dmu.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ dmu_recv.o \ dmu_redact.o \ dmu_send.o \ dmu_traverse.o \ dmu_tx.o \ dmu_zfetch.o \ dnode.o \ dnode_sync.o \ dsl_bookmark.o \ 
dsl_crypt.o \ dsl_dataset.o \ dsl_deadlist.o \ dsl_deleg.o \ dsl_destroy.o \ dsl_dir.o \ dsl_pool.o \ dsl_prop.o \ dsl_scan.o \ dsl_synctask.o \ dsl_userhold.o \ edonr_zfs.o \ fm.o \ gzip.o \ hkdf.o \ lz4.o \ lz4_zfs.o \ lzjb.o \ metaslab.o \ mmp.o \ multilist.o \ objlist.o \ pathname.o \ range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ sha256.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ spa_history.o \ spa_log_spacemap.o \ spa_misc.o \ spa_stats.o \ space_map.o \ space_reftree.o \ txg.o \ uberblock.o \ unique.o \ vdev.o \ vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ vdev_raidz_math_scalar.o \ vdev_rebuild.o \ vdev_removal.o \ vdev_root.o \ vdev_trim.o \ zap.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ zcp_get.o \ zcp_global.o \ zcp_iter.o \ zcp_set.o \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ zfs_quota.o \ zfs_ratelimit.o \ zfs_replay.o \ zfs_rlock.o \ zfs_sa.o \ zfs_vnops.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ zle.o \ zrlock.o \ zthr.o \ zvol.o ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ policy.o \ qat.o \ qat_compress.o \ qat_crypt.o \ spa_misc_os.o \ trace.o \ vdev_disk.o \ vdev_file.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ zfs_dir.o \ zfs_file_os.o \ zfs_ioctl_os.o \ zfs_racct.o \ zfs_sysfs.o \ zfs_uio.o \ zfs_vfsops.o \ zfs_vnops_os.o \ zfs_znode.o \ zio_crypt.o \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ zvol_os.o ZFS_OBJS_X86 := \ vdev_raidz_math_avx2.o \ vdev_raidz_math_avx512bw.o \ vdev_raidz_math_avx512f.o \ vdev_raidz_math_sse2.o \ vdev_raidz_math_ssse3.o ZFS_OBJS_ARM64 := \ vdev_raidz_math_aarch64_neon.o \ vdev_raidz_math_aarch64_neonx2.o ZFS_OBJS_PPC_PPC64 := \ vdev_raidz_math_powerpc_altivec.o zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS)) zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c index 5bc1bf92dad4..61085214c77b 100644 --- a/module/icp/algs/aes/aes_impl_aesni.c +++ b/module/icp/algs/aes/aes_impl_aesni.c @@ -1,124 +1,125 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #if defined(__x86_64) && defined(HAVE_AES) #include #include +#include /* These functions are used to execute AES-NI instructions: */ -extern int rijndael_key_setup_enc_intel(uint32_t rk[], +extern ASMABI int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); -extern int rijndael_key_setup_dec_intel(uint32_t rk[], +extern ASMABI int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], uint64_t keyBits); -extern void aes_encrypt_intel(const uint32_t rk[], int Nr, +extern ASMABI void aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); -extern void aes_decrypt_intel(const uint32_t rk[], int Nr, +extern ASMABI void aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); #include /* * Expand the 32-bit AES cipher key array into the encryption and decryption * key schedules. * * Parameters: * key AES key schedule to be initialized * keyarr32 User key * keyBits AES key size (128, 192, or 256 bits) */ static void aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits) { kfpu_begin(); key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]), keyarr32, keybits); key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]), keyarr32, keybits); kfpu_end(); } /* * Encrypt one block of data. The block is assumed to be an array * of four uint32_t values, so copy for alignment (and byte-order * reversal for little endian systems might be necessary on the * input and output byte streams. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4*(Nr + 1). * * Parameters: * rk Key schedule, of aes_ks_t (60 32-bit integers) * Nr Number of rounds * pt Input block (plain text) * ct Output block (crypto text). Can overlap with pt */ static void aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]) { kfpu_begin(); aes_encrypt_intel(rk, Nr, pt, ct); kfpu_end(); } /* * Decrypt one block of data. The block is assumed to be an array * of four uint32_t values, so copy for alignment (and byte-order * reversal for little endian systems might be necessary on the * input and output byte streams. * The size of the key schedule depends on the number of rounds * (which can be computed from the size of the key), i.e. 4*(Nr + 1). * * Parameters: * rk Key schedule, of aes_ks_t (60 32-bit integers) * Nr Number of rounds * ct Input block (crypto text) * pt Output block (plain text). 
Can overlap with pt */ static void aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]) { kfpu_begin(); aes_decrypt_intel(rk, Nr, ct, pt); kfpu_end(); } static boolean_t aes_aesni_will_work(void) { return (kfpu_allowed() && zfs_aes_available()); } const aes_impl_ops_t aes_aesni_impl = { .generate = &aes_aesni_generate, .encrypt = &aes_aesni_encrypt, .decrypt = &aes_aesni_decrypt, .is_supported = &aes_aesni_will_work, .needs_byteswap = B_FALSE, .name = "aesni" }; #endif /* defined(__x86_64) && defined(HAVE_AES) */ diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h index eef74eaa9098..ecb51e3a3010 100644 --- a/module/icp/algs/blake3/blake3_impl.h +++ b/module/icp/algs/blake3/blake3_impl.h @@ -1,213 +1,214 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor * Copyright (c) 2021-2022 Tino Reichardt */ #ifndef BLAKE3_IMPL_H #define BLAKE3_IMPL_H #ifdef __cplusplus extern "C" { #endif #include #include #include +#include /* * Methods used to define BLAKE3 assembler implementations */ typedef void (*blake3_compress_in_place_f)(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); typedef void (*blake3_compress_xof_f)(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); typedef boolean_t (*blake3_is_supported_f)(void); typedef struct blake3_impl_ops { blake3_compress_in_place_f compress_in_place; blake3_compress_xof_f compress_xof; blake3_hash_many_f hash_many; blake3_is_supported_f is_supported; int degree; const char *name; } blake3_ops_t; /* Return selected BLAKE3 implementation ops */ extern const blake3_ops_t *blake3_impl_get_ops(void); extern const blake3_ops_t blake3_generic_impl; #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) extern const blake3_ops_t blake3_sse2_impl; #endif #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) extern const blake3_ops_t blake3_sse41_impl; #endif #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) extern const blake3_ops_t blake3_avx2_impl; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) extern const blake3_ops_t blake3_avx512_impl; #endif #if defined(__x86_64) 
#define MAX_SIMD_DEGREE 16 #else #define MAX_SIMD_DEGREE 4 #endif #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) static const uint32_t BLAKE3_IV[8] = { 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, }; /* Find index of the highest set bit */ static inline unsigned int highest_one(uint64_t x) { #if defined(__GNUC__) || defined(__clang__) return (63 ^ __builtin_clzll(x)); #elif defined(_MSC_VER) && defined(IS_X86_64) unsigned long index; _BitScanReverse64(&index, x); return (index); #elif defined(_MSC_VER) && defined(IS_X86_32) if (x >> 32) { unsigned long index; _BitScanReverse(&index, x >> 32); return (32 + index); } else { unsigned long index; _BitScanReverse(&index, x); return (index); } #else unsigned int c = 0; if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } if (x & 0x0000000000000002ULL) { c += 1; } return (c); #endif } /* Count the number of 1 bits. */ static inline unsigned int popcnt(uint64_t x) { unsigned int count = 0; while (x != 0) { count += 1; x &= x - 1; } return (count); } /* * Largest power of two less than or equal to x. * As a special case, returns 1 when x is 0. 
*/ static inline uint64_t round_down_to_power_of_2(uint64_t x) { return (1ULL << highest_one(x | 1)); } static inline uint32_t counter_low(uint64_t counter) { return ((uint32_t)counter); } static inline uint32_t counter_high(uint64_t counter) { return ((uint32_t)(counter >> 32)); } static inline uint32_t load32(const void *src) { const uint8_t *p = (const uint8_t *)src; return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); } static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], uint32_t key_words[8]) { key_words[0] = load32(&key[0 * 4]); key_words[1] = load32(&key[1 * 4]); key_words[2] = load32(&key[2 * 4]); key_words[3] = load32(&key[3 * 4]); key_words[4] = load32(&key[4 * 4]); key_words[5] = load32(&key[5 * 4]); key_words[6] = load32(&key[6 * 4]); key_words[7] = load32(&key[7 * 4]); } static inline void store32(void *dst, uint32_t w) { uint8_t *p = (uint8_t *)dst; p[0] = (uint8_t)(w >> 0); p[1] = (uint8_t)(w >> 8); p[2] = (uint8_t)(w >> 16); p[3] = (uint8_t)(w >> 24); } static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { store32(&bytes_out[0 * 4], cv_words[0]); store32(&bytes_out[1 * 4], cv_words[1]); store32(&bytes_out[2 * 4], cv_words[2]); store32(&bytes_out[3 * 4], cv_words[3]); store32(&bytes_out[4 * 4], cv_words[4]); store32(&bytes_out[5 * 4], cv_words[5]); store32(&bytes_out[6 * 4], cv_words[6]); store32(&bytes_out[7 * 4], cv_words[7]); } #ifdef __cplusplus } #endif #endif /* BLAKE3_IMPL_H */ diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c index 8139789fd779..04a8b3333656 100644 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -1,248 +1,248 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2021-2022 Tino Reichardt */ #include "blake3_impl.h" #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); static void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { kfpu_begin(); zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, flags); kfpu_end(); } static void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { kfpu_begin(); zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out); kfpu_end(); } static void blake3_hash_many_sse2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { kfpu_begin(); zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); kfpu_end(); } static boolean_t blake3_is_sse2_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse2_available()); #elif defined(__PPC64__) && defined(__linux__) return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); #endif } const blake3_ops_t blake3_sse2_impl = { .compress_in_place = blake3_compress_in_place_sse2, .compress_xof = blake3_compress_xof_sse2, .hash_many = blake3_hash_many_sse2, .is_supported = blake3_is_sse2_supported, .degree = 4, .name = "sse2" }; #endif #if defined(__aarch64__) || \ (defined(__x86_64) && defined(HAVE_SSE2)) || \ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) -extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t 
*out); static void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { kfpu_begin(); zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, flags); kfpu_end(); } static void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { kfpu_begin(); zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out); kfpu_end(); } static void blake3_hash_many_sse41(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { kfpu_begin(); zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); kfpu_end(); } static boolean_t blake3_is_sse41_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse4_1_available()); #elif defined(__PPC64__) && defined(__linux__) return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); #endif } const blake3_ops_t blake3_sse41_impl = { .compress_in_place = blake3_compress_in_place_sse41, .compress_xof = blake3_compress_xof_sse41, .hash_many = blake3_hash_many_sse41, .is_supported = blake3_is_sse41_supported, .degree = 4, .name = "sse41" }; #endif #if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) -extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); static void blake3_hash_many_avx2(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { kfpu_begin(); zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); kfpu_end(); } static boolean_t blake3_is_avx2_supported(void) { return (kfpu_allowed() && zfs_sse4_1_available() && zfs_avx2_available()); } const blake3_ops_t blake3_avx2_impl = { .compress_in_place = blake3_compress_in_place_sse41, .compress_xof = blake3_compress_xof_sse41, .hash_many = blake3_hash_many_avx2, .is_supported = blake3_is_avx2_supported, .degree = 8, .name = "avx2" }; #endif #if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) -extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags); -extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], +extern void ASMABI zfs_blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]); -extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, +extern void ASMABI zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); static void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t 
block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags) { kfpu_begin(); zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, flags); kfpu_end(); } static void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]) { kfpu_begin(); zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out); kfpu_end(); } static void blake3_hash_many_avx512(const uint8_t * const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, boolean_t increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out) { kfpu_begin(); zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); kfpu_end(); } static boolean_t blake3_is_avx512_supported(void) { return (kfpu_allowed() && zfs_avx512f_available() && zfs_avx512vl_available()); } const blake3_ops_t blake3_avx512_impl = { .compress_in_place = blake3_compress_in_place_avx512, .compress_xof = blake3_compress_xof_avx512, .hash_many = blake3_hash_many_avx512, .is_supported = blake3_is_avx512_supported, .degree = 16, .name = "avx512" }; #endif diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c index a8792c4555e2..472ec4bc9e13 100644 --- a/module/icp/algs/modes/gcm.c +++ b/module/icp/algs/modes/gcm.c @@ -1,1596 +1,1596 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. */ #include #include #include #include #include #include #include #include #ifdef CAN_USE_GCM_ASM #include #endif #define GHASH(c, d, t, o) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ (uint64_t *)(void *)(t)); /* Select GCM implementation */ #define IMPL_FASTEST (UINT32_MAX) #define IMPL_CYCLE (UINT32_MAX-1) #ifdef CAN_USE_GCM_ASM #define IMPL_AVX (UINT32_MAX-2) #endif #define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) static uint32_t icp_gcm_impl = IMPL_FASTEST; static uint32_t user_sel_impl = IMPL_FASTEST; #ifdef CAN_USE_GCM_ASM /* Does the architecture we run on support the MOVBE instruction? */ boolean_t gcm_avx_can_use_movbe = B_FALSE; /* * Whether to use the optimized openssl gcm and ghash implementations. * Set to true if module parameter icp_gcm_impl == "avx". 
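 * Note that "fastest" also ends up selecting the avx code path when the
 * required CPU features are present; see gcm_impl_init() and gcm_impl_set()
 * below.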
*/ static boolean_t gcm_use_avx = B_FALSE; #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) -extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); +extern boolean_t ASMABI atomic_toggle_boolean_nv(volatile boolean_t *); static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); static inline boolean_t gcm_toggle_avx(void); static inline size_t gcm_simd_get_htab_size(boolean_t); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, size_t, size_t); #endif /* ifdef CAN_USE_GCM_ASM */ /* * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode * is done in another function. */ int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_mode_encrypt_contiguous_blocks_avx( ctx, data, length, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t remainder = length; size_t need = 0; uint8_t *datap = (uint8_t *)data; uint8_t *blockp; uint8_t *lastp; void *iov_or_mp; offset_t offset; uint8_t *out_data_1; uint8_t *out_data_2; size_t out_data_1_len; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); if (length + ctx->gcm_remainder_len < block_size) { /* accumulate bytes here and return */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } crypto_init_ptrs(out, &iov_or_mp, &offset); gops = gcm_impl_get_ops(); do { /* Unprocessed data from last call. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (need > remainder) return (CRYPTO_DATA_LEN_RANGE); memcpy(&((uint8_t *)ctx->gcm_remainder) [ctx->gcm_remainder_len], datap, need); blockp = (uint8_t *)ctx->gcm_remainder; } else { blockp = datap; } /* * Increment counter. Counter bits are confined * to the bottom 32 bits of the counter block. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); xor_block(blockp, (uint8_t *)ctx->gcm_tmp); lastp = (uint8_t *)ctx->gcm_tmp; ctx->gcm_processed_data_len += block_size; crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, &out_data_1_len, &out_data_2, block_size); /* copy block to where it belongs */ if (out_data_1_len == block_size) { copy_block(lastp, out_data_1); } else { memcpy(out_data_1, lastp, out_data_1_len); if (out_data_2 != NULL) { memcpy(out_data_2, lastp + out_data_1_len, block_size - out_data_1_len); } } /* update offset */ out->cd_offset += block_size; /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); /* Update pointer to next block of data to be processed. */ if (ctx->gcm_remainder_len != 0) { datap += need; ctx->gcm_remainder_len = 0; } else { datap += block_size; } remainder = (size_t)&data[length] - (size_t)datap; /* Incomplete last block. 
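 * Anything shorter than a block is buffered in gcm_remainder and picked up
 * by the next call or by gcm_encrypt_final().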
*/ if (remainder > 0 && remainder < block_size) { memcpy(ctx->gcm_remainder, datap, remainder); ctx->gcm_remainder_len = remainder; ctx->gcm_copy_to = datap; goto out; } ctx->gcm_copy_to = NULL; } while (remainder > 0); out: return (CRYPTO_SUCCESS); } int gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) copy_block; #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_encrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint8_t *ghash, *macp = NULL; int i, rv; if (out->cd_length < (ctx->gcm_remainder_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; if (ctx->gcm_remainder_len > 0) { uint64_t counter; uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp; /* * Here is where we deal with data that is not a * multiple of the block size. */ /* * Increment counter. */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, (uint8_t *)ctx->gcm_tmp); macp = (uint8_t *)ctx->gcm_remainder; memset(macp + ctx->gcm_remainder_len, 0, block_size - ctx->gcm_remainder_len); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { macp[i] ^= tmpp[i]; } /* add ciphertext to the hash */ GHASH(ctx, macp, ghash, gops); ctx->gcm_processed_data_len += ctx->gcm_remainder_len; } ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); if (ctx->gcm_remainder_len > 0) { rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += ctx->gcm_remainder_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; return (CRYPTO_SUCCESS); } /* * This will only deal with decrypting the last block of the input that * might not be a multiple of block length. */ static void gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { uint8_t *datap, *outp, *counterp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int i; /* * Increment counter. 
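 * The counter block is kept in big endian byte order, so ntohll()/htonll()
 * bring the low 32 bits into host order for the increment and back.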
* Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; datap = (uint8_t *)ctx->gcm_remainder; outp = &((ctx->gcm_pt_buf)[index]); counterp = (uint8_t *)ctx->gcm_tmp; /* authentication tag */ memset((uint8_t *)ctx->gcm_tmp, 0, block_size); memcpy((uint8_t *)ctx->gcm_tmp, datap, ctx->gcm_remainder_len); /* add ciphertext to the hash */ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops()); /* decrypt remaining ciphertext */ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp); /* XOR with counter block */ for (i = 0; i < ctx->gcm_remainder_len; i++) { outp[i] = datap[i] ^ counterp[i]; } } int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { (void) out, (void) block_size, (void) encrypt_block, (void) copy_block, (void) xor_block; size_t new_len; uint8_t *new; /* * Copy contiguous ciphertext input blocks to plaintext buffer. * Ciphertext will be decrypted in the final. */ if (length > 0) { new_len = ctx->gcm_pt_buf_len + length; new = vmem_alloc(new_len, KM_SLEEP); if (new == NULL) { vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); ctx->gcm_pt_buf = NULL; return (CRYPTO_HOST_MEMORY); } if (ctx->gcm_pt_buf != NULL) { memcpy(new, ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); } else { ASSERT0(ctx->gcm_pt_buf_len); } ctx->gcm_pt_buf = new; ctx->gcm_pt_buf_len = new_len; memcpy(&ctx->gcm_pt_buf[ctx->gcm_processed_data_len], data, length); ctx->gcm_processed_data_len += length; } ctx->gcm_remainder_len = 0; return (CRYPTO_SUCCESS); } int gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { #ifdef CAN_USE_GCM_ASM if (ctx->gcm_use_avx == B_TRUE) return (gcm_decrypt_final_avx(ctx, out, block_size)); #endif const gcm_impl_ops_t *gops; size_t pt_len; size_t remainder; uint8_t *ghash; uint8_t *blockp; uint8_t *cbp; uint64_t counter; uint64_t counter_mask = ntohll(0x00000000ffffffffULL); int processed = 0, rv; ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len); gops = gcm_impl_get_ops(); pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; ghash = (uint8_t *)ctx->gcm_ghash; blockp = ctx->gcm_pt_buf; remainder = pt_len; while (remainder > 0) { /* Incomplete last block */ if (remainder < block_size) { memcpy(ctx->gcm_remainder, blockp, remainder); ctx->gcm_remainder_len = remainder; /* * not expecting anymore ciphertext, just * compute plaintext for the remaining input */ gcm_decrypt_incomplete_block(ctx, block_size, processed, encrypt_block, xor_block); ctx->gcm_remainder_len = 0; goto out; } /* add ciphertext to the hash */ GHASH(ctx, blockp, ghash, gops); /* * Increment counter. 
* Counter bits are confined to the bottom 32 bits */ counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + 1); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; cbp = (uint8_t *)ctx->gcm_tmp; encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp); /* XOR with ciphertext */ xor_block(cbp, blockp); processed += block_size; blockp += block_size; remainder -= block_size; } out: ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, (uint8_t *)ctx->gcm_J0); xor_block((uint8_t *)ctx->gcm_J0, ghash); /* compare the input authentication tag with what we calculated */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match */ return (CRYPTO_INVALID_MAC); } else { rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += pt_len; } return (CRYPTO_SUCCESS); } static int gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param) { size_t tag_len; /* * Check the length of the authentication tag (in bits). */ tag_len = gcm_param->ulTagBits; switch (tag_len) { case 32: case 64: case 96: case 104: case 112: case 120: case 128: break; default: return (CRYPTO_MECHANISM_PARAM_INVALID); } if (gcm_param->ulIvLen == 0) return (CRYPTO_MECHANISM_PARAM_INVALID); return (CRYPTO_SUCCESS); } static void gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, gcm_ctx_t *ctx, size_t block_size, void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *cb; ulong_t remainder = iv_len; ulong_t processed = 0; uint8_t *datap, *ghash; uint64_t len_a_len_c[2]; gops = gcm_impl_get_ops(); ghash = (uint8_t *)ctx->gcm_ghash; cb = (uint8_t *)ctx->gcm_cb; if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* J0 will be used again in the final */ copy_block(cb, (uint8_t *)ctx->gcm_J0); } else { /* GHASH the IV */ do { if (remainder < block_size) { memset(cb, 0, block_size); memcpy(cb, &(iv[processed]), remainder); datap = (uint8_t *)cb; remainder = 0; } else { datap = (uint8_t *)(&(iv[processed])); processed += block_size; remainder -= block_size; } GHASH(ctx, datap, ghash, gops); } while (remainder > 0); len_a_len_c[0] = 0; len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len)); GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops); /* J0 will be used again in the final */ copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb); } } static int gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { const gcm_impl_ops_t *gops; uint8_t *ghash, *datap, *authp; size_t remainder, processed; /* encrypt zero block to get subkey H */ memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H, (uint8_t *)ctx->gcm_H); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, copy_block, xor_block); gops = gcm_impl_get_ops(); authp = (uint8_t *)ctx->gcm_tmp; ghash = (uint8_t *)ctx->gcm_ghash; memset(authp, 0, block_size); memset(ghash, 0, block_size); processed = 0; remainder = auth_data_len; do { if (remainder < block_size) { /* * There's not a block full of data, pad rest of * buffer with zero */ if (auth_data != NULL) { memset(authp, 0, block_size); memcpy(authp, 
&(auth_data[processed]), remainder); } else { ASSERT0(remainder); } datap = (uint8_t *)authp; remainder = 0; } else { datap = (uint8_t *)(&(auth_data[processed])); processed += block_size; remainder -= block_size; } /* add auth data to the hash */ GHASH(ctx, datap, ghash, gops); } while (remainder > 0); return (CRYPTO_SUCCESS); } /* * The following function is called at encrypt or decrypt init time * for AES GCM mode. * * Init the GCM context struct. Handle the cycle and avx implementations here. */ int gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GCM_PARAMS *gcm_param; if (param != NULL) { gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; if ((rv = gcm_validate_args(gcm_param)) != 0) { return (rv); } gcm_ctx->gcm_tag_len = gcm_param->ulTagBits; gcm_ctx->gcm_tag_len >>= 3; gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GCM_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { /* * Handle the "cycle" implementation by creating avx and * non-avx contexts alternately. */ gcm_ctx->gcm_use_avx = gcm_toggle_avx(); /* * We don't handle byte swapped key schedules in the avx * code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Use the MOVBE and the BSWAP variants alternately. */ if (gcm_ctx->gcm_use_avx == B_TRUE && zfs_movbe_available() == B_TRUE) { (void) atomic_toggle_boolean_nv( (volatile boolean_t *)&gcm_avx_can_use_movbe); } } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. 
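 * The avx path precomputes its own Htable in gcm_init_avx() below rather
 * than going through gcm_init().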
*/ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } int gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), void (*copy_block)(uint8_t *, uint8_t *), void (*xor_block)(uint8_t *, uint8_t *)) { int rv; CK_AES_GMAC_PARAMS *gmac_param; if (param != NULL) { gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param; gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); gcm_ctx->gcm_processed_data_len = 0; /* these values are in bits */ gcm_ctx->gcm_len_a_len_c[0] = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen)); rv = CRYPTO_SUCCESS; gcm_ctx->gcm_flags |= GMAC_MODE; } else { return (CRYPTO_MECHANISM_PARAM_INVALID); } #ifdef CAN_USE_GCM_ASM /* * Handle the "cycle" implementation by creating avx and non avx * contexts alternately. */ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; } else { gcm_ctx->gcm_use_avx = gcm_toggle_avx(); } /* We don't handle byte swapped key schedules in the avx code path. */ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } /* Allocate Htab memory as needed. */ if (gcm_ctx->gcm_use_avx == B_TRUE) { size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); if (htab_len == 0) { return (CRYPTO_MECHANISM_PARAM_INVALID); } gcm_ctx->gcm_htab_len = htab_len; gcm_ctx->gcm_Htable = kmem_alloc(htab_len, KM_SLEEP); if (gcm_ctx->gcm_Htable == NULL) { return (CRYPTO_HOST_MEMORY); } } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size, encrypt_block, copy_block, xor_block) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } #ifdef CAN_USE_GCM_ASM } else { if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { rv = CRYPTO_MECHANISM_PARAM_INVALID; } } #endif /* ifdef CAN_USE_GCM_ASM */ return (rv); } void * gcm_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GCM_MODE; return (gcm_ctx); } void * gmac_alloc_ctx(int kmflag) { gcm_ctx_t *gcm_ctx; if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) return (NULL); gcm_ctx->gcm_flags = GMAC_MODE; return (gcm_ctx); } /* GCM implementation that contains the fastest methods */ static gcm_impl_ops_t gcm_fastest_impl = { .name = "fastest" }; /* All compiled in implementations */ static const gcm_impl_ops_t *gcm_all_impl[] = { &gcm_generic_impl, #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) &gcm_pclmulqdq_impl, #endif }; /* Indicate that benchmark has been completed */ static boolean_t gcm_impl_initialized = B_FALSE; /* Hold all supported implementations */ static size_t gcm_supp_impl_cnt = 0; static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; /* * Returns the GCM operations for encrypt/decrypt/key setup. 
When a * SIMD implementation is not allowed in the current context, then * fallback to the fastest generic implementation. */ const gcm_impl_ops_t * gcm_impl_get_ops(void) { if (!kfpu_allowed()) return (&gcm_generic_impl); const gcm_impl_ops_t *ops = NULL; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); switch (impl) { case IMPL_FASTEST: ASSERT(gcm_impl_initialized); ops = &gcm_fastest_impl; break; case IMPL_CYCLE: /* Cycle through supported implementations */ ASSERT(gcm_impl_initialized); ASSERT3U(gcm_supp_impl_cnt, >, 0); static size_t cycle_impl_idx = 0; size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; ops = gcm_supp_impl[idx]; break; #ifdef CAN_USE_GCM_ASM case IMPL_AVX: /* * Make sure that we return a valid implementation while * switching to the avx implementation since there still * may be unfinished non-avx contexts around. */ ops = &gcm_generic_impl; break; #endif default: ASSERT3U(impl, <, gcm_supp_impl_cnt); ASSERT3U(gcm_supp_impl_cnt, >, 0); if (impl < ARRAY_SIZE(gcm_all_impl)) ops = gcm_supp_impl[impl]; break; } ASSERT3P(ops, !=, NULL); return (ops); } /* * Initialize all supported implementations. */ void gcm_impl_init(void) { gcm_impl_ops_t *curr_impl; int i, c; /* Move supported implementations into gcm_supp_impls */ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; if (curr_impl->is_supported()) gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl; } gcm_supp_impl_cnt = c; /* * Set the fastest implementation given the assumption that the * hardware accelerated version is the fastest. */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) if (gcm_pclmulqdq_impl.is_supported()) { memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, sizeof (gcm_fastest_impl)); } else #endif { memcpy(&gcm_fastest_impl, &gcm_generic_impl, sizeof (gcm_fastest_impl)); } strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if it's available and the implementation * hasn't changed from its default value of fastest on module load. */ if (gcm_avx_will_work()) { #ifdef HAVE_MOVBE if (zfs_movbe_available() == B_TRUE) { atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); } #endif if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { gcm_set_avx(B_TRUE); } } #endif /* Finish initialization */ atomic_swap_32(&icp_gcm_impl, user_sel_impl); gcm_impl_initialized = B_TRUE; } static const struct { const char *name; uint32_t sel; } gcm_impl_opts[] = { { "cycle", IMPL_CYCLE }, { "fastest", IMPL_FASTEST }, #ifdef CAN_USE_GCM_ASM { "avx", IMPL_AVX }, #endif }; /* * Function sets desired gcm implementation. * * If we are called before init(), user preference will be saved in * user_sel_impl, and applied in later init() call. This occurs when module * parameter is specified on module load. Otherwise, directly update * icp_gcm_impl. * * @val Name of gcm implementation to use * @param Unused. */ int gcm_impl_set(const char *val) { int err = -EINVAL; char req_name[GCM_IMPL_NAME_MAX]; uint32_t impl = GCM_IMPL_READ(user_sel_impl); size_t i; /* sanitize input */ i = strnlen(val, GCM_IMPL_NAME_MAX); if (i == 0 || i >= GCM_IMPL_NAME_MAX) return (err); strlcpy(req_name, val, GCM_IMPL_NAME_MAX); while (i > 0 && isspace(req_name[i-1])) i--; req_name[i] = '\0'; /* Check mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. 
*/ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { impl = gcm_impl_opts[i].sel; err = 0; break; } } /* check all supported impl if init() was already called */ if (err != 0 && gcm_impl_initialized) { /* check all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) { impl = i; err = 0; break; } } } #ifdef CAN_USE_GCM_ASM /* * Use the avx implementation if available and the requested one is * avx or fastest. */ if (gcm_avx_will_work() == B_TRUE && (impl == IMPL_AVX || impl == IMPL_FASTEST)) { gcm_set_avx(B_TRUE); } else { gcm_set_avx(B_FALSE); } #endif if (err == 0) { if (gcm_impl_initialized) atomic_swap_32(&icp_gcm_impl, impl); else atomic_swap_32(&user_sel_impl, impl); } return (err); } #if defined(_KERNEL) && defined(__linux__) static int icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) { return (gcm_impl_set(val)); } static int icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) { int i, cnt = 0; char *fmt; const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); ASSERT(gcm_impl_initialized); /* list mandatory options */ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { #ifdef CAN_USE_GCM_ASM /* Ignore avx implementation if it won't work. */ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { continue; } #endif fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, gcm_impl_opts[i].name); } /* list all supported implementations */ for (i = 0; i < gcm_supp_impl_cnt; i++) { fmt = (i == impl) ? "[%s] " : "%s "; cnt += kmem_scnprintf(buffer + cnt, PAGE_SIZE - cnt, fmt, gcm_supp_impl[i]->name); } return (cnt); } module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, NULL, 0644); MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); #endif /* defined(__KERNEL) */ #ifdef CAN_USE_GCM_ASM #define GCM_BLOCK_LEN 16 /* * The openssl asm routines are 6x aggregated and need that many bytes * at minimum. */ #define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) #define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) /* * Ensure the chunk size is reasonable since we are allocating a * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. */ #define GCM_AVX_MAX_CHUNK_SIZE \ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) /* Clear the FPU registers since they hold sensitive internal state. */ #define clear_fpu_regs() clear_fpu_regs_avx() #define GHASH_AVX(ctx, in, len) \ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) /* Get the chunk size module parameter. */ #define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size /* * Module parameter: number of bytes to process at once while owning the FPU. * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is * ensured to be greater or equal than GCM_AVX_MIN_DECRYPT_BYTES. 
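 * For example, the default request of 32 KiB is rounded down to
 * (32768 / 96) * 96 = 32736 bytes, GCM_AVX_MIN_DECRYPT_BYTES being 96.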
*/ static uint32_t gcm_avx_chunk_size = ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; -extern void clear_fpu_regs_avx(void); -extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); -extern void aes_encrypt_intel(const uint32_t rk[], int nr, +extern void ASMABI clear_fpu_regs_avx(void); +extern void ASMABI gcm_xor_avx(const uint8_t *src, uint8_t *dst); +extern void ASMABI aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); -extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); -extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, +extern void ASMABI gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +extern void ASMABI gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); -extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, +extern size_t ASMABI aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); -extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, +extern size_t ASMABI aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, const void *, uint64_t *, uint64_t *); static inline boolean_t gcm_avx_will_work(void) { /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ return (kfpu_allowed() && zfs_avx_available() && zfs_aes_available() && zfs_pclmulqdq_available()); } static inline void gcm_set_avx(boolean_t val) { if (gcm_avx_will_work() == B_TRUE) { atomic_swap_32(&gcm_use_avx, val); } } static inline boolean_t gcm_toggle_avx(void) { if (gcm_avx_will_work() == B_TRUE) { return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); } else { return (B_FALSE); } } static inline size_t gcm_simd_get_htab_size(boolean_t simd_mode) { switch (simd_mode) { case B_TRUE: return (2 * 6 * 2 * sizeof (uint64_t)); default: return (0); } } /* * Clear sensitive data in the context. * * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and * ctx->gcm_Htable contain the hash sub key which protects authentication. * * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for * a known plaintext attack, they consists of the IV and the first and last * counter respectively. If they should be cleared is debatable. */ static inline void gcm_clear_ctx(gcm_ctx_t *ctx) { memset(ctx->gcm_remainder, 0, sizeof (ctx->gcm_remainder)); memset(ctx->gcm_H, 0, sizeof (ctx->gcm_H)); memset(ctx->gcm_J0, 0, sizeof (ctx->gcm_J0)); memset(ctx->gcm_tmp, 0, sizeof (ctx->gcm_tmp)); } /* Increment the GCM counter block by n. */ static inline void gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) { uint64_t counter_mask = ntohll(0x00000000ffffffffULL); uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); counter = htonll(counter + n); counter &= counter_mask; ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; } /* * Encrypt multiple blocks of data in GCM mode. * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines * if possible. While processing a chunk the FPU is "locked". 
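 * Chunking bounds how long the FPU is held per kfpu_begin()/kfpu_end()
 * section, since preemption and interrupts may be disabled while it is owned.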
*/ static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, size_t length, crypto_data_t *out, size_t block_size) { size_t bleft = length; size_t need = 0; size_t done = 0; uint8_t *datap = (uint8_t *)data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint64_t *ghash = ctx->gcm_ghash; uint64_t *cb = ctx->gcm_cb; uint8_t *ct_buf = NULL; uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; ASSERT(block_size == GCM_BLOCK_LEN); /* * If the last call left an incomplete block, try to fill * it first. */ if (ctx->gcm_remainder_len > 0) { need = block_size - ctx->gcm_remainder_len; if (length < need) { /* Accumulate bytes here and return. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, length); ctx->gcm_remainder_len += length; if (ctx->gcm_copy_to == NULL) { ctx->gcm_copy_to = datap; } return (CRYPTO_SUCCESS); } else { /* Complete incomplete block. */ memcpy((uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, datap, need); ctx->gcm_copy_to = NULL; } } /* Allocate a buffer to encrypt to if there is enough input. */ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { ct_buf = vmem_alloc(chunk_size, KM_SLEEP); if (ct_buf == NULL) { return (CRYPTO_HOST_MEMORY); } } /* If we completed an incomplete block, encrypt and write it out. */ if (ctx->gcm_remainder_len > 0) { kfpu_begin(); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); GHASH_AVX(ctx, tmp, block_size); clear_fpu_regs(); kfpu_end(); rv = crypto_put_output_data(tmp, out, block_size); out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; bleft -= need; datap += need; ctx->gcm_remainder_len = 0; } /* Do the bulk encryption in chunk_size blocks. */ for (; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_encrypt( datap, ct_buf, chunk_size, key, cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { rv = CRYPTO_FAILED; goto out_nofpu; } rv = crypto_put_output_data(ct_buf, out, chunk_size); if (rv != CRYPTO_SUCCESS) { goto out_nofpu; } out->cd_offset += chunk_size; datap += chunk_size; ctx->gcm_processed_data_len += chunk_size; } /* Check if we are already done. */ if (bleft == 0) { goto out_nofpu; } /* Bulk encrypt the remaining data. */ kfpu_begin(); if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); if (done == 0) { rv = CRYPTO_FAILED; goto out; } rv = crypto_put_output_data(ct_buf, out, done); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += done; ctx->gcm_processed_data_len += done; datap += done; bleft -= done; } /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ while (bleft > 0) { if (bleft < block_size) { memcpy(ctx->gcm_remainder, datap, bleft); ctx->gcm_remainder_len = bleft; ctx->gcm_copy_to = datap; goto out; } /* Encrypt, hash and write out. 
*/ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (const uint32_t *)cb, (uint32_t *)tmp); gcm_xor_avx(datap, tmp); GHASH_AVX(ctx, tmp, block_size); rv = crypto_put_output_data(tmp, out, block_size); if (rv != CRYPTO_SUCCESS) { goto out; } out->cd_offset += block_size; gcm_incr_counter_block(ctx); ctx->gcm_processed_data_len += block_size; datap += block_size; bleft -= block_size; } out: clear_fpu_regs(); kfpu_end(); out_nofpu: if (ct_buf != NULL) { vmem_free(ct_buf, chunk_size); } return (rv); } /* * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. */ static int gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; uint32_t *J0 = (uint32_t *)ctx->gcm_J0; uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; size_t rem_len = ctx->gcm_remainder_len; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)keysched)->nr; int rv; ASSERT(block_size == GCM_BLOCK_LEN); if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { return (CRYPTO_DATA_LEN_RANGE); } kfpu_begin(); /* Pad last incomplete block with zeros, encrypt and hash. */ if (rem_len > 0) { uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; const uint32_t *cb = (uint32_t *)ctx->gcm_cb; aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); memset(remainder + rem_len, 0, block_size - rem_len); for (int i = 0; i < rem_len; i++) { remainder[i] ^= tmp[i]; } GHASH_AVX(ctx, remainder, block_size); ctx->gcm_processed_data_len += rem_len; /* No need to increment counter_block, it's the last block. */ } /* Finish tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(keysched, aes_rounds, J0, J0); gcm_xor_avx((uint8_t *)J0, ghash); clear_fpu_regs(); kfpu_end(); /* Output remainder. */ if (rem_len > 0) { rv = crypto_put_output_data(remainder, out, rem_len); if (rv != CRYPTO_SUCCESS) return (rv); } out->cd_offset += rem_len; ctx->gcm_remainder_len = 0; rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); if (rv != CRYPTO_SUCCESS) return (rv); out->cd_offset += ctx->gcm_tag_len; /* Clear sensitive data in the context before returning. */ gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Finalize decryption: We just have accumulated crypto text, so now we * decrypt it here inplace. */ static int gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) { ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); ASSERT3U(block_size, ==, 16); size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; uint8_t *datap = ctx->gcm_pt_buf; const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); uint32_t *cb = (uint32_t *)ctx->gcm_cb; uint64_t *ghash = ctx->gcm_ghash; uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; int rv = CRYPTO_SUCCESS; size_t bleft, done; /* * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be * greater or equal than GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of * GCM_AVX_MIN_DECRYPT_BYTES. */ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { kfpu_begin(); done = aesni_gcm_decrypt(datap, datap, chunk_size, (const void *)key, ctx->gcm_cb, ghash); clear_fpu_regs(); kfpu_end(); if (done != chunk_size) { return (CRYPTO_FAILED); } datap += done; } /* Decrypt remainder, which is less than chunk size, in one go. 
*/ kfpu_begin(); if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { done = aesni_gcm_decrypt(datap, datap, bleft, (const void *)key, ctx->gcm_cb, ghash); if (done == 0) { clear_fpu_regs(); kfpu_end(); return (CRYPTO_FAILED); } datap += done; bleft -= done; } ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); /* * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, * decrypt them block by block. */ while (bleft > 0) { /* Incomplete last block. */ if (bleft < block_size) { uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; memset(lastb, 0, block_size); memcpy(lastb, datap, bleft); /* The GCM processing. */ GHASH_AVX(ctx, lastb, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); for (size_t i = 0; i < bleft; i++) { datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; } break; } /* The GCM processing. */ GHASH_AVX(ctx, datap, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); gcm_xor_avx((uint8_t *)tmp, datap); gcm_incr_counter_block(ctx); datap += block_size; bleft -= block_size; } if (rv != CRYPTO_SUCCESS) { clear_fpu_regs(); kfpu_end(); return (rv); } /* Decryption done, finish the tag. */ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, (uint32_t *)ctx->gcm_J0); gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); /* We are done with the FPU, restore its state. */ clear_fpu_regs(); kfpu_end(); /* Compare the input authentication tag with what we calculated. */ if (memcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { /* They don't match. */ return (CRYPTO_INVALID_MAC); } rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); if (rv != CRYPTO_SUCCESS) { return (rv); } out->cd_offset += pt_len; gcm_clear_ctx(ctx); return (CRYPTO_SUCCESS); } /* * Initialize the GCM params H, Htabtle and the counter block. Save the * initial counter block. */ static int gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, unsigned char *auth_data, size_t auth_data_len, size_t block_size) { uint8_t *cb = (uint8_t *)ctx->gcm_cb; uint64_t *H = ctx->gcm_H; const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; uint8_t *datap = auth_data; size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; size_t bleft; ASSERT(block_size == GCM_BLOCK_LEN); /* Init H (encrypt zero block) and create the initial counter block. */ memset(ctx->gcm_ghash, 0, sizeof (ctx->gcm_ghash)); memset(H, 0, sizeof (ctx->gcm_H)); kfpu_begin(); aes_encrypt_intel(keysched, aes_rounds, (const uint32_t *)H, (uint32_t *)H); gcm_init_htab_avx(ctx->gcm_Htable, H); if (iv_len == 12) { memcpy(cb, iv, 12); cb[12] = 0; cb[13] = 0; cb[14] = 0; cb[15] = 1; /* We need the ICB later. */ memcpy(ctx->gcm_J0, cb, sizeof (ctx->gcm_J0)); } else { /* * Most consumers use 12 byte IVs, so it's OK to use the * original routines for other IV sizes, just avoid nesting * kfpu_begin calls. */ clear_fpu_regs(); kfpu_end(); gcm_format_initial_blocks(iv, iv_len, ctx, block_size, aes_copy_block, aes_xor_block); kfpu_begin(); } /* Openssl post increments the counter, adjust for that. */ gcm_incr_counter_block(ctx); /* Ghash AAD in chunk_size blocks. */ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { GHASH_AVX(ctx, datap, chunk_size); datap += chunk_size; clear_fpu_regs(); kfpu_end(); kfpu_begin(); } /* Ghash the remainder and handle possible incomplete GCM block. 
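 * A trailing partial block of AAD is zero padded to a full 16 bytes before
 * it is hashed, mirroring gcm_init().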
*/ if (bleft > 0) { size_t incomp = bleft % block_size; bleft -= incomp; if (bleft > 0) { GHASH_AVX(ctx, datap, bleft); datap += bleft; } if (incomp > 0) { /* Zero pad and hash incomplete last block. */ uint8_t *authp = (uint8_t *)ctx->gcm_tmp; memset(authp, 0, block_size); memcpy(authp, datap, incomp); GHASH_AVX(ctx, authp, block_size); } } clear_fpu_regs(); kfpu_end(); return (CRYPTO_SUCCESS); } #if defined(_KERNEL) static int icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) { unsigned long val; char val_rounded[16]; int error = 0; error = kstrtoul(buf, 0, &val); if (error) return (error); val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) return (-EINVAL); snprintf(val_rounded, 16, "%u", (uint32_t)val); error = param_set_uint(val_rounded, kp); return (error); } module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, param_get_uint, &gcm_avx_chunk_size, 0644); MODULE_PARM_DESC(icp_gcm_avx_chunk_size, "How many bytes to process while owning the FPU"); #endif /* defined(__KERNEL) */ #endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c index c2c8bc221203..737d2e47ecb7 100644 --- a/module/icp/algs/modes/gcm_pclmulqdq.c +++ b/module/icp/algs/modes/gcm_pclmulqdq.c @@ -1,64 +1,65 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. */ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ) #include #include +#include /* These functions are used to execute pclmulqdq based assembly methods */ -extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); +extern void ASMABI gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); #include /* * Perform a carry-less multiplication (that is, use XOR instead of the * multiply operator) on *x_in and *y and place the result in *res. * * Byte swap the input (*x_in and *y) and the output (*res). * * Note: x_in, y, and res all point to 16-byte numbers (an array of two * 64-bit integers). 
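 * As a small example of carry-less multiplication, 0b0110 x 0b0101 is
 * (0b0110 << 2) ^ (0b0110 << 0) = 0b11110: the partial products are XORed
 * instead of added, so no carries propagate.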
*/ static void gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) { kfpu_begin(); gcm_mul_pclmulqdq(x_in, y, res); kfpu_end(); } static boolean_t gcm_pclmulqdq_will_work(void) { return (kfpu_allowed() && zfs_pclmulqdq_available()); } const gcm_impl_ops_t gcm_pclmulqdq_impl = { .mul = &gcm_pclmulqdq_mul, .is_supported = &gcm_pclmulqdq_will_work, .name = "pclmulqdq" }; #endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */ diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c index 151432f1a5df..e6bbe34eaa57 100644 --- a/module/icp/algs/sha2/sha2.c +++ b/module/icp/algs/sha2/sha2.c @@ -1,956 +1,957 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright 2013 Saso Kiselkov. All rights reserved. */ /* * The basic framework for this code came from the reference * implementation for MD5. That implementation is Copyright (C) * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. * * License to copy and use this software is granted provided that it * is identified as the "RSA Data Security, Inc. MD5 Message-Digest * Algorithm" in all material mentioning or referencing this software * or this function. * * License is also granted to make and use derivative works provided * that such works are identified as "derived from the RSA Data * Security, Inc. MD5 Message-Digest Algorithm" in all material * mentioning or referencing the derived work. * * RSA Data Security, Inc. makes no representations concerning either * the merchantability of this software or the suitability of this * software for any particular purpose. It is provided "as is" * without express or implied warranty of any kind. * * These notices must be retained in any copies of any part of this * documentation and/or software. * * NOTE: Cleaned-up and optimized, version of SHA2, based on the FIPS 180-2 * standard, available at * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf * Not as fast as one would like -- further optimizations are encouraged * and appreciated. */ #include #define _SHA2_IMPL #include #include #define _RESTRICT_KYWD #ifdef _ZFS_LITTLE_ENDIAN #include #define HAVE_HTONL #endif #include /* for _ILP32 */ +#include static void Encode(uint8_t *, uint32_t *, size_t); static void Encode64(uint8_t *, uint64_t *, size_t); /* userspace only supports the generic version */ #if defined(__amd64) && defined(_KERNEL) #define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1) #define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1) -void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); -void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); +void ASMABI SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); +void ASMABI SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); #else static void SHA256Transform(SHA2_CTX *, const uint8_t *); static void SHA512Transform(SHA2_CTX *, const uint8_t *); #endif /* __amd64 && _KERNEL */ static const uint8_t PADDING[128] = { 0x80, /* all zeros */ }; /* * The low-level checksum routines use a lot of stack space. On systems where * small stacks are enforced (like 32-bit kernel builds), insert compiler memory * barriers to reduce stack frame size. This can reduce the SHA512Transform() * stack frame usage from 3k to <1k on ARM32, for example. 
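 * The empty asm with a "memory" clobber acts as an optimization barrier
 * between rounds, limiting how many intermediates the compiler keeps live
 * at once.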
*/ #if defined(_ILP32) || defined(__powerpc) /* small stack */ #define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory"); #else #define SMALL_STACK_MEMORY_BARRIER #endif /* Ch and Maj are the basic SHA2 functions. */ #define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d))) #define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d))) /* Rotates x right n bits. */ #define ROTR(x, n) \ (((x) >> (n)) | ((x) << ((sizeof (x) * NBBY)-(n)))) /* Shift x right n bits */ #define SHR(x, n) ((x) >> (n)) /* SHA256 Functions */ #define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22)) #define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25)) #define SIGMA0_256(x) (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3)) #define SIGMA1_256(x) (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10)) #define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \ T1 = h + BIGSIGMA1_256(e) + Ch(e, f, g) + SHA256_CONST(i) + w; \ d += T1; \ T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \ h = T1 + T2 /* SHA384/512 Functions */ #define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39)) #define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41)) #define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7)) #define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6)) #define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \ T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \ d += T1; \ T2 = BIGSIGMA0(a) + Maj(a, b, c); \ h = T1 + T2; \ SMALL_STACK_MEMORY_BARRIER; /* * sparc optimization: * * on the sparc, we can load big endian 32-bit data easily. note that * special care must be taken to ensure the address is 32-bit aligned. * in the interest of speed, we don't check to make sure, since * careful programming can guarantee this for us. */ #if defined(_ZFS_BIG_ENDIAN) #define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) #define LOAD_BIG_64(addr) (*(uint64_t *)(addr)) #elif defined(HAVE_HTONL) #define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr))) #define LOAD_BIG_64(addr) htonll(*((uint64_t *)(addr))) #else /* little endian -- will work on big endian, but slowly */ #define LOAD_BIG_32(addr) \ (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3]) #define LOAD_BIG_64(addr) \ (((uint64_t)(addr)[0] << 56) | ((uint64_t)(addr)[1] << 48) | \ ((uint64_t)(addr)[2] << 40) | ((uint64_t)(addr)[3] << 32) | \ ((uint64_t)(addr)[4] << 24) | ((uint64_t)(addr)[5] << 16) | \ ((uint64_t)(addr)[6] << 8) | (uint64_t)(addr)[7]) #endif /* _BIG_ENDIAN */ #if !defined(__amd64) || !defined(_KERNEL) /* SHA256 Transform */ static void SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk) { uint32_t a = ctx->state.s32[0]; uint32_t b = ctx->state.s32[1]; uint32_t c = ctx->state.s32[2]; uint32_t d = ctx->state.s32[3]; uint32_t e = ctx->state.s32[4]; uint32_t f = ctx->state.s32[5]; uint32_t g = ctx->state.s32[6]; uint32_t h = ctx->state.s32[7]; uint32_t w0, w1, w2, w3, w4, w5, w6, w7; uint32_t w8, w9, w10, w11, w12, w13, w14, w15; uint32_t T1, T2; #if defined(__sparc) static const uint32_t sha256_consts[] = { SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2, SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5, SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8, SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11, SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14, SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17, SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20, SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23, SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26, SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29, SHA256_CONST_30, 
SHA256_CONST_31, SHA256_CONST_32, SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35, SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38, SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41, SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44, SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47, SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50, SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53, SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56, SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59, SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62, SHA256_CONST_63 }; #endif /* __sparc */ if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */ memcpy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32)); blk = (uint8_t *)ctx->buf_un.buf32; } /* LINTED E_BAD_PTR_CAST_ALIGN */ w0 = LOAD_BIG_32(blk + 4 * 0); SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); /* LINTED E_BAD_PTR_CAST_ALIGN */ w1 = LOAD_BIG_32(blk + 4 * 1); SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); /* LINTED E_BAD_PTR_CAST_ALIGN */ w2 = LOAD_BIG_32(blk + 4 * 2); SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); /* LINTED E_BAD_PTR_CAST_ALIGN */ w3 = LOAD_BIG_32(blk + 4 * 3); SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); /* LINTED E_BAD_PTR_CAST_ALIGN */ w4 = LOAD_BIG_32(blk + 4 * 4); SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); /* LINTED E_BAD_PTR_CAST_ALIGN */ w5 = LOAD_BIG_32(blk + 4 * 5); SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); /* LINTED E_BAD_PTR_CAST_ALIGN */ w6 = LOAD_BIG_32(blk + 4 * 6); SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); /* LINTED E_BAD_PTR_CAST_ALIGN */ w7 = LOAD_BIG_32(blk + 4 * 7); SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); /* LINTED E_BAD_PTR_CAST_ALIGN */ w8 = LOAD_BIG_32(blk + 4 * 8); SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); /* LINTED E_BAD_PTR_CAST_ALIGN */ w9 = LOAD_BIG_32(blk + 4 * 9); SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); /* LINTED E_BAD_PTR_CAST_ALIGN */ w10 = LOAD_BIG_32(blk + 4 * 10); SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); /* LINTED E_BAD_PTR_CAST_ALIGN */ w11 = LOAD_BIG_32(blk + 4 * 11); SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); /* LINTED E_BAD_PTR_CAST_ALIGN */ w12 = LOAD_BIG_32(blk + 4 * 12); SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); /* LINTED E_BAD_PTR_CAST_ALIGN */ w13 = LOAD_BIG_32(blk + 4 * 13); SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); /* LINTED E_BAD_PTR_CAST_ALIGN */ w14 = LOAD_BIG_32(blk + 4 * 14); SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); /* LINTED E_BAD_PTR_CAST_ALIGN */ w15 = LOAD_BIG_32(blk + 4 * 15); SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); w11 = 
SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); ctx->state.s32[0] += a; 
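For reference, the fully unrolled rounds above keep the SHA-256 message schedule in a rolling 16-word window (w0..w15). An equivalent, slower loop form of the same recurrence, using this file's SIGMA macros, shown only to document what the unrolled updates compute (w[0..15] hold the block words fetched with LOAD_BIG_32):

    uint32_t w[16];     /* filled from the 64-byte block, big-endian */
    for (int t = 16; t < 64; t++) {
            w[t & 15] = SIGMA1_256(w[(t - 2) & 15]) + w[(t - 7) & 15] +
                SIGMA0_256(w[(t - 15) & 15]) + w[(t - 16) & 15];
    }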
ctx->state.s32[1] += b; ctx->state.s32[2] += c; ctx->state.s32[3] += d; ctx->state.s32[4] += e; ctx->state.s32[5] += f; ctx->state.s32[6] += g; ctx->state.s32[7] += h; } /* SHA384 and SHA512 Transform */ static void SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk) { uint64_t a = ctx->state.s64[0]; uint64_t b = ctx->state.s64[1]; uint64_t c = ctx->state.s64[2]; uint64_t d = ctx->state.s64[3]; uint64_t e = ctx->state.s64[4]; uint64_t f = ctx->state.s64[5]; uint64_t g = ctx->state.s64[6]; uint64_t h = ctx->state.s64[7]; uint64_t w0, w1, w2, w3, w4, w5, w6, w7; uint64_t w8, w9, w10, w11, w12, w13, w14, w15; uint64_t T1, T2; #if defined(__sparc) static const uint64_t sha512_consts[] = { SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2, SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5, SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8, SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11, SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14, SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17, SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20, SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23, SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26, SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29, SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32, SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35, SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38, SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41, SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44, SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47, SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50, SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53, SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56, SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59, SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62, SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65, SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68, SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71, SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74, SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77, SHA512_CONST_78, SHA512_CONST_79 }; #endif /* __sparc */ if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? 
*/ memcpy(ctx->buf_un.buf64, blk, sizeof (ctx->buf_un.buf64)); blk = (uint8_t *)ctx->buf_un.buf64; } /* LINTED E_BAD_PTR_CAST_ALIGN */ w0 = LOAD_BIG_64(blk + 8 * 0); SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0); /* LINTED E_BAD_PTR_CAST_ALIGN */ w1 = LOAD_BIG_64(blk + 8 * 1); SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1); /* LINTED E_BAD_PTR_CAST_ALIGN */ w2 = LOAD_BIG_64(blk + 8 * 2); SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2); /* LINTED E_BAD_PTR_CAST_ALIGN */ w3 = LOAD_BIG_64(blk + 8 * 3); SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3); /* LINTED E_BAD_PTR_CAST_ALIGN */ w4 = LOAD_BIG_64(blk + 8 * 4); SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4); /* LINTED E_BAD_PTR_CAST_ALIGN */ w5 = LOAD_BIG_64(blk + 8 * 5); SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5); /* LINTED E_BAD_PTR_CAST_ALIGN */ w6 = LOAD_BIG_64(blk + 8 * 6); SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6); /* LINTED E_BAD_PTR_CAST_ALIGN */ w7 = LOAD_BIG_64(blk + 8 * 7); SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7); /* LINTED E_BAD_PTR_CAST_ALIGN */ w8 = LOAD_BIG_64(blk + 8 * 8); SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8); /* LINTED E_BAD_PTR_CAST_ALIGN */ w9 = LOAD_BIG_64(blk + 8 * 9); SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9); /* LINTED E_BAD_PTR_CAST_ALIGN */ w10 = LOAD_BIG_64(blk + 8 * 10); SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10); /* LINTED E_BAD_PTR_CAST_ALIGN */ w11 = LOAD_BIG_64(blk + 8 * 11); SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11); /* LINTED E_BAD_PTR_CAST_ALIGN */ w12 = LOAD_BIG_64(blk + 8 * 12); SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12); /* LINTED E_BAD_PTR_CAST_ALIGN */ w13 = LOAD_BIG_64(blk + 8 * 13); SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13); /* LINTED E_BAD_PTR_CAST_ALIGN */ w14 = LOAD_BIG_64(blk + 8 * 14); SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14); /* LINTED E_BAD_PTR_CAST_ALIGN */ w15 = LOAD_BIG_64(blk + 8 * 15); SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15); w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0); w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1); w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2); w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3); w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4); w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5); w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6); w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7); w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8); w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9); w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10); w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11); w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12); w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13); w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14); w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15); w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0); w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1); w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2); w3 = SIGMA1(w1) + w12 + 
SIGMA0(w4) + w3; SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3); w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4); w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5); w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6); w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7); w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8); w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9); w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10); w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11); w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12); w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13); w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14); w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15); w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0); w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1); w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2); w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3); w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4); w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5); w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6); w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7); w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8); w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9); w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10); w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11); w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12); w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13); w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14); w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15); w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0); w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1); w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2); w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3); w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4); w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5); w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6); w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7); w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8); w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9); w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10); w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11); w12 = 
SIGMA1(w10) + w5 + SIGMA0(w13) + w12; SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12); w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13); w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14); w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15); ctx->state.s64[0] += a; ctx->state.s64[1] += b; ctx->state.s64[2] += c; ctx->state.s64[3] += d; ctx->state.s64[4] += e; ctx->state.s64[5] += f; ctx->state.s64[6] += g; ctx->state.s64[7] += h; } #endif /* !__amd64 || !_KERNEL */ /* * Encode() * * purpose: to convert a list of numbers from little endian to big endian * input: uint8_t * : place to store the converted big endian numbers * uint32_t * : place to get numbers to convert from * size_t : the length of the input in bytes * output: void */ static void Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input, size_t len) { size_t i, j; #if defined(__sparc) if (IS_P2ALIGNED(output, sizeof (uint32_t))) { for (i = 0, j = 0; j < len; i++, j += 4) { /* LINTED E_BAD_PTR_CAST_ALIGN */ *((uint32_t *)(output + j)) = input[i]; } } else { #endif /* little endian -- will work on big endian, but slowly */ for (i = 0, j = 0; j < len; i++, j += 4) { output[j] = (input[i] >> 24) & 0xff; output[j + 1] = (input[i] >> 16) & 0xff; output[j + 2] = (input[i] >> 8) & 0xff; output[j + 3] = input[i] & 0xff; } #if defined(__sparc) } #endif } static void Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input, size_t len) { size_t i, j; #if defined(__sparc) if (IS_P2ALIGNED(output, sizeof (uint64_t))) { for (i = 0, j = 0; j < len; i++, j += 8) { /* LINTED E_BAD_PTR_CAST_ALIGN */ *((uint64_t *)(output + j)) = input[i]; } } else { #endif /* little endian -- will work on big endian, but slowly */ for (i = 0, j = 0; j < len; i++, j += 8) { output[j] = (input[i] >> 56) & 0xff; output[j + 1] = (input[i] >> 48) & 0xff; output[j + 2] = (input[i] >> 40) & 0xff; output[j + 3] = (input[i] >> 32) & 0xff; output[j + 4] = (input[i] >> 24) & 0xff; output[j + 5] = (input[i] >> 16) & 0xff; output[j + 6] = (input[i] >> 8) & 0xff; output[j + 7] = input[i] & 0xff; } #if defined(__sparc) } #endif } void SHA2Init(uint64_t mech, SHA2_CTX *ctx) { switch (mech) { case SHA256_MECH_INFO_TYPE: case SHA256_HMAC_MECH_INFO_TYPE: case SHA256_HMAC_GEN_MECH_INFO_TYPE: ctx->state.s32[0] = 0x6a09e667U; ctx->state.s32[1] = 0xbb67ae85U; ctx->state.s32[2] = 0x3c6ef372U; ctx->state.s32[3] = 0xa54ff53aU; ctx->state.s32[4] = 0x510e527fU; ctx->state.s32[5] = 0x9b05688cU; ctx->state.s32[6] = 0x1f83d9abU; ctx->state.s32[7] = 0x5be0cd19U; break; case SHA384_MECH_INFO_TYPE: case SHA384_HMAC_MECH_INFO_TYPE: case SHA384_HMAC_GEN_MECH_INFO_TYPE: ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL; ctx->state.s64[1] = 0x629a292a367cd507ULL; ctx->state.s64[2] = 0x9159015a3070dd17ULL; ctx->state.s64[3] = 0x152fecd8f70e5939ULL; ctx->state.s64[4] = 0x67332667ffc00b31ULL; ctx->state.s64[5] = 0x8eb44a8768581511ULL; ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL; ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL; break; case SHA512_MECH_INFO_TYPE: case SHA512_HMAC_MECH_INFO_TYPE: case SHA512_HMAC_GEN_MECH_INFO_TYPE: ctx->state.s64[0] = 0x6a09e667f3bcc908ULL; ctx->state.s64[1] = 0xbb67ae8584caa73bULL; ctx->state.s64[2] = 0x3c6ef372fe94f82bULL; ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL; ctx->state.s64[4] = 0x510e527fade682d1ULL; ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL; ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL; ctx->state.s64[7] = 0x5be0cd19137e2179ULL; break; case 
SHA512_224_MECH_INFO_TYPE: ctx->state.s64[0] = 0x8C3D37C819544DA2ULL; ctx->state.s64[1] = 0x73E1996689DCD4D6ULL; ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL; ctx->state.s64[3] = 0x679DD514582F9FCFULL; ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL; ctx->state.s64[5] = 0x77E36F7304C48942ULL; ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL; ctx->state.s64[7] = 0x1112E6AD91D692A1ULL; break; case SHA512_256_MECH_INFO_TYPE: ctx->state.s64[0] = 0x22312194FC2BF72CULL; ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL; ctx->state.s64[2] = 0x2393B86B6F53B151ULL; ctx->state.s64[3] = 0x963877195940EABDULL; ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL; ctx->state.s64[5] = 0xBE5E1E2553863992ULL; ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL; ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL; break; #ifdef _KERNEL default: cmn_err(CE_PANIC, "sha2_init: failed to find a supported algorithm: 0x%x", (uint32_t)mech); #endif /* _KERNEL */ } ctx->algotype = (uint32_t)mech; ctx->count.c64[0] = ctx->count.c64[1] = 0; } #ifndef _KERNEL // #pragma inline(SHA256Init, SHA384Init, SHA512Init) void SHA256Init(SHA256_CTX *ctx) { SHA2Init(SHA256, ctx); } void SHA384Init(SHA384_CTX *ctx) { SHA2Init(SHA384, ctx); } void SHA512Init(SHA512_CTX *ctx) { SHA2Init(SHA512, ctx); } #endif /* _KERNEL */ /* * SHA2Update() * * purpose: continues an sha2 digest operation, using the message block * to update the context. * input: SHA2_CTX * : the context to update * void * : the message block * size_t : the length of the message block, in bytes * output: void */ void SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) { uint32_t i, buf_index, buf_len, buf_limit; const uint8_t *input = inptr; uint32_t algotype = ctx->algotype; /* check for noop */ if (input_len == 0) return; if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { buf_limit = 64; /* compute number of bytes mod 64 */ buf_index = (ctx->count.c32[1] >> 3) & 0x3F; /* update number of bits */ if ((ctx->count.c32[1] += (input_len << 3)) < (input_len << 3)) ctx->count.c32[0]++; ctx->count.c32[0] += (input_len >> 29); } else { buf_limit = 128; /* compute number of bytes mod 128 */ buf_index = (ctx->count.c64[1] >> 3) & 0x7F; /* update number of bits */ if ((ctx->count.c64[1] += (input_len << 3)) < (input_len << 3)) ctx->count.c64[0]++; ctx->count.c64[0] += (input_len >> 29); } buf_len = buf_limit - buf_index; /* transform as many times as possible */ i = 0; if (input_len >= buf_len) { /* * general optimization: * * only do initial memcpy() and SHA2Transform() if * buf_index != 0. if buf_index == 0, we're just * wasting our time doing the memcpy() since there * wasn't any data left over from a previous call to * SHA2Update(). 
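A minimal usage sketch of the streaming interface being defined here (the helper name and buffer are illustrative; SHA2Final() below also zeroizes the context):

    static void
    example_sha256(const void *msg, size_t len, uint8_t digest[32])
    {
            SHA2_CTX ctx;

            SHA2Init(SHA256_MECH_INFO_TYPE, &ctx);
            SHA2Update(&ctx, msg, len);     /* may be split across calls; partial blocks are buffered */
            SHA2Final(digest, &ctx);        /* writes 32 bytes, then zeroizes the context */
    }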
*/ if (buf_index) { memcpy(&ctx->buf_un.buf8[buf_index], input, buf_len); if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) SHA256Transform(ctx, ctx->buf_un.buf8); else SHA512Transform(ctx, ctx->buf_un.buf8); i = buf_len; } #if !defined(__amd64) || !defined(_KERNEL) if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { for (; i + buf_limit - 1 < input_len; i += buf_limit) { SHA256Transform(ctx, &input[i]); } } else { for (; i + buf_limit - 1 < input_len; i += buf_limit) { SHA512Transform(ctx, &input[i]); } } #else uint32_t block_count; if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { block_count = (input_len - i) >> 6; if (block_count > 0) { SHA256TransformBlocks(ctx, &input[i], block_count); i += block_count << 6; } } else { block_count = (input_len - i) >> 7; if (block_count > 0) { SHA512TransformBlocks(ctx, &input[i], block_count); i += block_count << 7; } } #endif /* !__amd64 || !_KERNEL */ /* * general optimization: * * if i and input_len are the same, return now instead * of calling memcpy(), since the memcpy() in this case * will be an expensive noop. */ if (input_len == i) return; buf_index = 0; } /* buffer remaining input */ memcpy(&ctx->buf_un.buf8[buf_index], &input[i], input_len - i); } /* * SHA2Final() * * purpose: ends an sha2 digest operation, finalizing the message digest and * zeroing the context. * input: uchar_t * : a buffer to store the digest * : The function actually uses void* because many * : callers pass things other than uchar_t here. * SHA2_CTX * : the context to finalize, save, and zero * output: void */ void SHA2Final(void *digest, SHA2_CTX *ctx) { uint8_t bitcount_be[sizeof (ctx->count.c32)]; uint8_t bitcount_be64[sizeof (ctx->count.c64)]; uint32_t index; uint32_t algotype = ctx->algotype; if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { index = (ctx->count.c32[1] >> 3) & 0x3f; Encode(bitcount_be, ctx->count.c32, sizeof (bitcount_be)); SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); SHA2Update(ctx, bitcount_be, sizeof (bitcount_be)); Encode(digest, ctx->state.s32, sizeof (ctx->state.s32)); } else { index = (ctx->count.c64[1] >> 3) & 0x7f; Encode64(bitcount_be64, ctx->count.c64, sizeof (bitcount_be64)); SHA2Update(ctx, PADDING, ((index < 112) ? 
112 : 240) - index); SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64)); if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) { ctx->state.s64[6] = ctx->state.s64[7] = 0; Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 6); } else if (algotype == SHA512_224_MECH_INFO_TYPE) { uint8_t last[sizeof (uint64_t)]; /* * Since SHA-512/224 doesn't align well to 64-bit * boundaries, we must do the encoding in three steps: * 1) encode the three 64-bit words that fit neatly * 2) encode the last 64-bit word to a temp buffer * 3) chop out the lower 32-bits from the temp buffer * and append them to the digest */ Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3); Encode64(last, &ctx->state.s64[3], sizeof (uint64_t)); memcpy((uint8_t *)digest + 24, last, 4); } else if (algotype == SHA512_256_MECH_INFO_TYPE) { Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4); } else { Encode64(digest, ctx->state.s64, sizeof (ctx->state.s64)); } } /* zeroize sensitive information */ memset(ctx, 0, sizeof (*ctx)); } #ifdef _KERNEL EXPORT_SYMBOL(SHA2Init); EXPORT_SYMBOL(SHA2Update); EXPORT_SYMBOL(SHA2Final); #endif diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S index a0525dd464f5..d5cf4040fb93 100644 --- a/module/icp/asm-x86_64/aes/aes_amd64.S +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -1,908 +1,908 @@ /* * --------------------------------------------------------------------------- * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. * * LICENSE TERMS * * The free distribution and use of this software is allowed (with or without * changes) provided that: * * 1. source code distributions include the above copyright notice, this * list of conditions and the following disclaimer; * * 2. binary distributions include the above copyright notice, this list * of conditions and the following disclaimer in their documentation; * * 3. the name of the copyright holder is not used to endorse products * built using this software without specific written permission. * * DISCLAIMER * * This software is provided 'as is' with no explicit or implied warranties * in respect of its properties, including, but not limited to, correctness * and/or fitness for purpose. * --------------------------------------------------------------------------- * Issue 20/12/2007 * * I am grateful to Dag Arne Osvik for many discussions of the techniques that * can be used to optimise AES assembler code on AMD64/EM64T architectures. * Some of the techniques used in this implementation are the result of * suggestions made by him for which I am most grateful. * * An AES implementation for AMD64 processors using the YASM assembler. This * implementation provides only encryption, decryption and hence requires key * scheduling support in C. It uses 8k bytes of tables but its encryption and * decryption performance is very close to that obtained using large tables. * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, * which are as follows: * ms windows gnu/linux/opensolaris os * * in_blk rcx rdi * out_blk rdx rsi * context (cx) r8 rdx * * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 * registers rdi - on both * * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 * registers - rdi on both * * The convention used here is that for gnu/linux/opensolaris os. * * This code provides the standard AES block size (128 bits, 16 bytes) and the * three standard AES key sizes (128, 192 and 256 bits). It has the same call * interface as my C implementation. 
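Stepping back to the SHA2Final() hunk above: the three-step SHA-512/224 encoding exists because a 28-byte digest does not divide evenly into 64-bit words; three full big-endian words cover 24 bytes and the top half of the fourth state word supplies the rest. A trivial compile-time restatement of that split (illustrative only):

    _Static_assert(3 * sizeof (uint64_t) + sizeof (uint32_t) == 224 / 8,
        "SHA-512/224 digest is three whole words plus a half word");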
It uses the Microsoft C AMD64 calling * conventions in which the three parameters are placed in rcx, rdx and r8 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. * * OpenSolaris Note: * Modified to use GNU/Linux/Solaris calling conventions. * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. * * AES_RETURN aes_encrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt(const unsigned char in_blk[], * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * const aes_encrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_encrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * AES_RETURN aes_decrypt_key(const unsigned char key[], * unsigned int len, const aes_decrypt_ctx cx[1])/ * * where is 128, 102 or 256. In the last two calls the length can be in * either bits or bytes. * * Comment in/out the following lines to obtain the desired subroutines. These * selections MUST match those in the C header file aesopt.h */ #define AES_REV_DKS /* define if key decryption schedule is reversed */ #define LAST_ROUND_TABLES /* define for the faster version using extra tables */ /* * The encryption key schedule has the following in memory layout where N is the * number of rounds (10, 12 or 14): * * lo: | input key (round 0) | / each round is four 32-bit words * | encryption round 1 | * | encryption round 2 | * .... * | encryption round N-1 | * hi: | encryption round N | * * The decryption key schedule is normally set up so that it has the same * layout as above by actually reversing the order of the encryption key * schedule in memory (this happens when AES_REV_DKS is set): * * lo: | decryption round 0 | = | encryption round N | * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] * hi: | decryption round N | = | input key (round 0) | * * with rounds except the first and last modified using inv_mix_column() * But if AES_REV_DKS is NOT set the order of keys is left as it is for * encryption so that it has to be accessed in reverse when used for * decryption (although the inverse mix column modifications are done) * * lo: | decryption round 0 | = | input key (round 0) | * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] * .... .... * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] * hi: | decryption round N | = | encryption round N | * * This layout is faster when the assembler key scheduling provided here * is used. * * End of user defines */ /* * --------------------------------------------------------------------------- * OpenSolaris OS modifications * * This source originates from Brian Gladman file aes_amd64.asm * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip * with these changes: * * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, * AES_128, AES_192, AES_256, AES_VAR ifdefs. * * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define * * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef * * 4. 
Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax * (operands reversed, literals prefixed with "$", registers prefixed with "%", * and "[register+offset]", addressing changed to "offset(register)", * parenthesis in constant expressions "()" changed to square brackets "[]", * "." removed from local (numeric) labels, and other changes. * Examples: * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax * mov rax,(4*20h) mov $[4*0x20],%rax * mov rax,[ebx+20h] mov 0x20(%ebx),%rax * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax * * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function * definitions for lint. * * 6. Renamed functions and reordered parameters to match OpenSolaris: * Original Gladman interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, * and a union type, inf., containing inf.l, a uint32_t and * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is * used and contains the key schedule length * 16 where key schedule length is * 10, 12, or 14 bytes. * * OpenSolaris OS interface: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, * ct is crypto text, and MAX_AES_NR is 14. * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
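A hedged sketch of calling the routine through the OpenSolaris interface documented above; the aes_ks_t layout and MAX_AES_NR value are taken from the comment, while the wrapper itself is illustrative and not part of this file:

    #include <stdint.h>

    #define MAX_AES_NR      14

    typedef union {
            uint64_t ks64[(MAX_AES_NR + 1) * 4];
            uint32_t ks32[(MAX_AES_NR + 1) * 4];
    } aes_ks_t;

    extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
        const uint32_t pt[4], uint32_t ct[4]);

    /* On x86-64 the assembly is handed the 32-bit view of the key schedule. */
    static void
    encrypt_block(const aes_ks_t *ks, int Nr, const uint32_t pt[4], uint32_t ct[4])
    {
            aes_encrypt_amd64(ks->ks32, Nr, pt, ct);
    }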
*/ #if defined(lint) || defined(__lint) #include void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], - uint32_t ct[4]) { - (void) rk, (void) Nr, (void) pt, (void) ct; + uint32_t ct[4]) { + (void) rk, (void) Nr, (void) pt, (void) ct; } void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], - uint32_t pt[4]) { - (void) rk, (void) Nr, (void) pt, (void) ct; + uint32_t pt[4]) { + (void) rk, (void) Nr, (void) pt, (void) ct; } #else #define _ASM #include #define KS_LENGTH 60 #define raxd eax #define rdxd edx #define rcxd ecx #define rbxd ebx #define rsid esi #define rdid edi #define raxb al #define rdxb dl #define rcxb cl #define rbxb bl #define rsib sil #define rdib dil // finite field multiplies by {02}, {04} and {08} -#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] -#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] -#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] +#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) +#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) +#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) // finite field multiplies required in table generation -#define f3(x) [[f2(x)] ^ [x]] -#define f9(x) [[f8(x)] ^ [x]] -#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] -#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] -#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] +#define f3(x) ((f2(x)) ^ (x)) +#define f9(x) ((f8(x)) ^ (x)) +#define fb(x) ((f8(x)) ^ (f2(x)) ^ (x)) +#define fd(x) ((f8(x)) ^ (f4(x)) ^ (x)) +#define fe(x) ((f8(x)) ^ (f4(x)) ^ (f2(x))) // macros for expanding S-box data -#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] -#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] -#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 +#define u8(x) (f2(x)), (x), (x), (f3(x)), (f2(x)), (x), (x), (f3(x)) +#define v8(x) (fe(x)), (f9(x)), (fd(x)), (fb(x)), (fe(x)), (f9(x)), (fd(x)), (x) +#define w8(x) (x), 0, 0, 0, (x), 0, 0, 0 #define enc_vals(x) \ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ .byte 
x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) #define dec_vals(x) \ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) #define tptr %rbp /* table pointer */ #define kptr %r8 /* key schedule pointer */ #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ #define fk_ref(x, y) -16*x+fofs+4*y(kptr) #ifdef AES_REV_DKS #define rofs 128 #define ik_ref(x, y) -16*x+rofs+4*y(kptr) #else #define rofs -128 #define ik_ref(x, y) 16*x+rofs+4*y(kptr) #endif /* AES_REV_DKS */ #define tab_0(x) (tptr,x,8) #define 
tab_1(x) 3(tptr,x,8) #define tab_2(x) 2(tptr,x,8) #define tab_3(x) 1(tptr,x,8) #define tab_f(x) 1(tptr,x,8) #define tab_i(x) 7(tptr,x,8) #define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ add $2048, tptr; \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p1 #else #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ mov fk_ref(round,0), p1; \ mov fk_ref(round,1), p2; \ mov fk_ref(round,2), p3; \ mov fk_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p4; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p2; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p1; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p3; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p2; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p4; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p3; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_f(%rsi), %esi; \ movzx tab_f(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p1 #endif /* 
LAST_ROUND_TABLES */ #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3; \ \ mov p1, %eax; \ mov p2, %ebx; \ mov p3, %ecx; \ mov p4, %edx #ifdef LAST_ROUND_TABLES #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ add $2048, tptr; \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ shr $16, %eax; \ xor tab_0(%rsi), p1; \ xor tab_1(%rdi), p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ xor tab_2(%rsi), p3; \ xor tab_3(%rdi), p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ shr $16, %ebx; \ xor tab_0(%rsi), p2; \ xor tab_1(%rdi), p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ xor tab_2(%rsi), p4; \ xor tab_3(%rdi), p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ shr $16, %ecx; \ xor tab_0(%rsi), p3; \ xor tab_1(%rdi), p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ xor tab_2(%rsi), p1; \ xor tab_3(%rdi), p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ shr $16, %edx; \ xor tab_0(%rsi), p4; \ xor tab_1(%rdi), p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ xor tab_2(%rsi), p2; \ xor tab_3(%rdi), p3 #else #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ mov ik_ref(round,0), p1; \ mov ik_ref(round,1), p2; \ mov ik_ref(round,2), p3; \ mov ik_ref(round,3), p4; \ \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %eax; \ xor %esi, p1; \ rol $8, %edi; \ xor %edi, p2; \ movzx %al, %esi; \ movzx %ah, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p3; \ xor %edi, p4; \ \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ebx; \ xor %esi, p2; \ rol $8, %edi; \ xor %edi, p3; \ movzx %bl, %esi; \ movzx %bh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p4; \ xor %edi, p1; \ \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %ecx; \ xor %esi, p3; \ rol $8, %edi; \ xor %edi, p4; \ movzx %cl, %esi; \ movzx %ch, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p1; \ xor %edi, p2; \ \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ shr $16, %edx; \ xor %esi, p4; \ rol $8, %edi; \ xor %edi, p1; \ movzx %dl, %esi; \ movzx %dh, %edi; \ movzx tab_i(%rsi), %esi; \ movzx tab_i(%rdi), %edi; \ rol $16, %esi; \ rol $24, %edi; \ xor %esi, p2; \ xor %edi, p3 #endif /* LAST_ROUND_TABLES */ /* * OpenSolaris OS: * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t 
ct[4])/ * * Original interface: * int aes_encrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ -.section .rodata +SECTION_STATIC .align 64 enc_tab: enc_vals(u8) #ifdef LAST_ROUND_TABLES // Last Round Tables: enc_vals(w8) #endif ENTRY_NP(aes_encrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface - sub $[4*8], %rsp // Make room on stack to save registers + sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov %rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea enc_tab(%rip), tptr sub $fofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx xor fofs(kptr), %eax xor fofs+4(kptr), %ebx xor fofs+8(kptr), %ecx xor fofs+12(kptr), %edx lea (kptr,%rsi), kptr // Jump based on byte key length * 16: - cmp $[10*16], %esi + cmp $(10*16), %esi je 3f - cmp $[12*16], %esi + cmp $(12*16), %esi je 2f - cmp $[14*16], %esi + cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal forward rounds 1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 - add $[4*8], %rsp + add $(4*8), %rsp RET SET_SIZE(aes_encrypt_amd64) /* * OpenSolaris OS: * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, * const uint32_t pt[4], uint32_t ct[4])/ * * Original interface: * int aes_decrypt(const unsigned char *in, * unsigned char *out, const aes_encrypt_ctx cx[1])/ */ -.section .rodata +SECTION_STATIC .align 64 dec_tab: dec_vals(v8) #ifdef LAST_ROUND_TABLES // Last Round Tables: dec_vals(w8) #endif ENTRY_NP(aes_decrypt_amd64) ENDBR #ifdef GLADMAN_INTERFACE // Original interface sub $[4*8], %rsp // gnu/linux/opensolaris binary interface mov %rsi, (%rsp) // output pointer (P2) mov %rdx, %r8 // context (P3) mov %rbx, 1*8(%rsp) // P1: input pointer in rdi mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) mov %r12, 3*8(%rsp) // P3: context in r8 movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 #else // OpenSolaris OS interface - sub $[4*8], %rsp // Make room on stack to save registers + sub $(4*8), %rsp // Make room on stack to save registers mov %rcx, (%rsp) // Save output pointer (P4) on stack mov %rdi, %r8 // context (P1) mov 
%rdx, %rdi // P3: save input pointer shl $4, %esi // P2: esi byte key length * 16 mov %rbx, 1*8(%rsp) // Save registers mov %rbp, 2*8(%rsp) mov %r12, 3*8(%rsp) // P1: context in r8 // P2: byte key length * 16 in esi // P3: input pointer in rdi // P4: output pointer in (rsp) #endif /* GLADMAN_INTERFACE */ lea dec_tab(%rip), tptr sub $rofs, kptr // Load input block into registers mov (%rdi), %eax mov 1*4(%rdi), %ebx mov 2*4(%rdi), %ecx mov 3*4(%rdi), %edx #ifdef AES_REV_DKS mov kptr, %rdi lea (kptr,%rsi), kptr #else lea (kptr,%rsi), %rdi #endif xor rofs(%rdi), %eax xor rofs+4(%rdi), %ebx xor rofs+8(%rdi), %ecx xor rofs+12(%rdi), %edx // Jump based on byte key length * 16: - cmp $[10*16], %esi + cmp $(10*16), %esi je 3f - cmp $[12*16], %esi + cmp $(12*16), %esi je 2f - cmp $[14*16], %esi + cmp $(14*16), %esi je 1f mov $-1, %rax // error jmp 4f // Perform normal inverse rounds 1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) 2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) 3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) il_rnd(%r9d, %r10d, %r11d, %r12d, 0) // Copy results mov (%rsp), %rbx mov %r9d, (%rbx) mov %r10d, 4(%rbx) mov %r11d, 8(%rbx) mov %r12d, 12(%rbx) xor %rax, %rax 4: // Restore registers mov 1*8(%rsp), %rbx mov 2*8(%rsp), %rbp mov 3*8(%rsp), %r12 - add $[4*8], %rsp + add $(4*8), %rsp RET SET_SIZE(aes_decrypt_amd64) -#endif /* lint || __lint */ +#endif /* lint || __lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S index cb08430b81ed..8f9e766486f1 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx2.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -1,1835 +1,1829 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
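Context for the macro conversions in this and the following hunks: ENTRY_NP/ENTRY_ALIGN/SET_SIZE (and SECTION_STATIC above) come from the sys/asm_linkage.h header this change introduces, replacing hand-rolled .globl/.type/.p2align/.size directives. A hypothetical expansion, shown only for orientation (the real definitions may differ in alignment and section details):

    #define ENTRY_ALIGN(x, a)       \
            .text;                  \
            .balign a;              \
            .globl  x;              \
            .type   x, @function;   \
    x:

    #define SET_SIZE(x)             \
            .size   x, . - x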
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves * Copyright (c) 2022 Tino Reichardt */ #if defined(HAVE_AVX2) #define _ASM #include .intel_syntax noprefix -.global zfs_blake3_hash_many_avx2 .text -.type zfs_blake3_hash_many_avx2,@function -.p2align 6 -zfs_blake3_hash_many_avx2: +ENTRY_ALIGN(zfs_blake3_hash_many_avx2, 64) ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 680 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d vmovd xmm0, r9d vpbroadcastd ymm0, xmm0 vmovdqa ymmword ptr [rsp+0x280], ymm0 vpand ymm1, ymm0, ymmword ptr [ADD0+rip] vpand ymm2, ymm0, ymmword ptr [ADD1+rip] vmovdqa ymmword ptr [rsp+0x220], ymm2 vmovd xmm2, r8d vpbroadcastd ymm2, xmm2 vpaddd ymm2, ymm2, ymm1 vmovdqa ymmword ptr [rsp+0x240], ymm2 vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm1, ymm2 shr r8, 32 vmovd xmm3, r8d vpbroadcastd ymm3, xmm3 vpsubd ymm3, ymm3, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm3 shl rdx, 6 mov qword ptr [rsp+0x2A0], rdx cmp rsi, 8 jc 3f 2: vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x2A0] cmove eax, ebx mov dword ptr [rsp+0x200], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x20], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x40], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x60], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x80], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0xA0], ymm9 vshufps 
ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0xC0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0xE0], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x100], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x120], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x140], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x160], ymm11 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm8, ymm12, ymm14, 136 vmovaps ymmword ptr [rsp+0x180], ymm8 vshufps ymm9, ymm12, ymm14, 221 vmovaps ymmword ptr [rsp+0x1A0], ymm9 vshufps ymm10, ymm13, ymm15, 136 vmovaps ymmword ptr [rsp+0x1C0], ymm10 vshufps ymm11, ymm13, ymm15, 221 vmovaps ymmword ptr [rsp+0x1E0], ymm11 vpbroadcastd ymm15, dword ptr [rsp+0x200] prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] vpxor ymm15, ymm3, ymm15 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr 
[ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld 
ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, 
ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 
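/*
 * Reference sketch of the scalar quarter-round (G) that the unrolled vector
 * code above and below implements, eight inputs at a time (one input per
 * 32-bit ymm lane):
 *
 *   a += b + m0;  d = rotr32(d ^ a, 16);
 *   c += d;       b = rotr32(b ^ c, 12);
 *   a += b + m1;  d = rotr32(d ^ a,  8);
 *   c += d;       b = rotr32(b ^ c,  7);
 *
 * The message words for all eight lanes were transposed onto the stack at
 * the top of the loop, and each round reads them back at the [rsp+...]
 * offsets dictated by BLAKE3's fixed message permutation. AVX2 has no
 * 32-bit rotate instruction, so the byte-aligned rotates (16 and 8) are
 * done with vpshufb against the ROT16/ROT8 tables in .rodata, while the
 * 12- and 7-bit rotates are built from vpsrld/vpslld/vpor pairs.
 */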
vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] 
vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd 
ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, 
ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] vpaddd ymm1, ymm1, ymmword ptr [rsp] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxor ymm12, ymm12, ymm0 vpxor ymm13, ymm13, ymm1 vpxor ymm14, ymm14, ymm2 vpxor ymm15, ymm15, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm12, ymm12, ymm8 vpshufb 
ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpshufb ymm15, ymm15, ymm8 vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxor ymm4, ymm4, ymm8 vpxor ymm5, ymm5, ymm9 vpxor ymm6, ymm6, ymm10 vpxor ymm7, ymm7, ymm11 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vmovdqa ymmword ptr [rsp+0x200], ymm8 vpsrld ymm8, ymm5, 12 vpslld ymm5, ymm5, 20 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 12 vpslld ymm6, ymm6, 20 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 12 vpslld ymm7, ymm7, 20 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 12 vpslld ymm4, ymm4, 20 vpor ymm4, ymm4, ymm8 vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxor ymm15, ymm15, ymm0 vpxor ymm12, ymm12, ymm1 vpxor ymm13, ymm13, ymm2 vpxor ymm14, ymm14, ymm3 vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] vpshufb ymm15, ymm15, ymm8 vpshufb ymm12, ymm12, ymm8 vpshufb ymm13, ymm13, ymm8 vpshufb ymm14, ymm14, ymm8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] vpaddd ymm9, ymm9, ymm14 vpxor ymm5, ymm5, ymm10 vpxor ymm6, ymm6, ymm11 vpxor ymm7, ymm7, ymm8 vpxor ymm4, ymm4, ymm9 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpsrld ymm8, ymm5, 7 vpslld ymm5, ymm5, 25 vpor ymm5, ymm5, ymm8 vpsrld ymm8, ymm6, 7 vpslld ymm6, ymm6, 25 vpor ymm6, ymm6, ymm8 vpsrld ymm8, ymm7, 7 vpslld ymm7, ymm7, 25 vpor ymm7, ymm7, ymm8 vpsrld ymm8, ymm4, 7 vpslld ymm4, ymm4, 25 vpor ymm4, ymm4, ymm8 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 
0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp+0x220] vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] vmovdqa ymmword ptr [rsp+0x240], ymm1 vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] vpcmpgtd ymm2, ymm0, ymm2 vmovdqa ymm0, ymmword ptr [rsp+0x260] vpsubd ymm2, ymm0, ymm2 vmovdqa ymmword ptr [rsp+0x260], ymm2 add rdi, 64 add rbx, 256 mov qword ptr [rbp+0x50], rbx sub rsi, 8 cmp rsi, 8 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 RET .p2align 5 3: mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x2A0] movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] test rsi, 0x4 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovdqa ymm8, ymm0 vmovdqa ymm9, ymm1 vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] vpunpckldq ymm14, ymm12, ymm13 vpunpckhdq ymm15, ymm12, ymm13 vpermq ymm14, ymm14, 0x50 vpermq ymm15, ymm15, 0x50 vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] vpblendd ymm14, ymm14, ymm12, 0x44 vpblendd ymm15, ymm15, ymm12, 0x44 vmovdqa ymmword ptr [rsp], ymm14 vmovdqa ymmword ptr [rsp+0x20], ymm15 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vmovups ymm2, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm2, ymm3, 136 vshufps ymm5, ymm2, ymm3, 221 vmovups ymm2, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm3, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm2, ymm3, 136 vshufps ymm7, ymm2, ymm3, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 vmovups ymm10, ymmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 vshufps ymm12, ymm10, ymm11, 136 vshufps ymm13, ymm10, ymm11, 221 vmovups ymm10, ymmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 vmovups ymm11, ymmword ptr [r10+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 vshufps ymm14, ymm10, ymm11, 136 vshufps ymm15, ymm10, ymm11, 221 vpshufd ymm14, ymm14, 0x93 vpshufd ymm15, ymm15, 0x93 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] vpbroadcastd ymm2, dword ptr [rsp+0x200] vmovdqa ymm3, ymmword ptr [rsp] vmovdqa ymm11, ymmword ptr [rsp+0x20] vpblendd ymm3, ymm3, ymm2, 0x88 vpblendd ymm11, ymm11, ymm2, 0x88 vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa ymm10, ymm2 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm8, 
ymm8, ymm12 vmovdqa ymmword ptr [rsp+0x40], ymm4 nop vmovdqa ymmword ptr [rsp+0x60], ymm12 nop vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm5 vpaddd ymm8, ymm8, ymm13 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vmovdqa ymmword ptr [rsp+0x80], ymm5 vmovdqa ymmword ptr [rsp+0xA0], ymm13 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x93 vpshufd ymm8, ymm8, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x39 vpshufd ymm10, ymm10, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm8, ymm8, ymm14 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 12 vpslld ymm9, ymm9, 20 vpor ymm9, ymm9, ymm4 vpaddd ymm0, ymm0, ymm7 vpaddd ymm8, ymm8, ymm15 vpaddd ymm0, ymm0, ymm1 vpaddd ymm8, ymm8, ymm9 vpxor ymm3, ymm3, ymm0 vpxor ymm11, ymm11, ymm8 vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] vpshufb ymm3, ymm3, ymm4 vpshufb ymm11, ymm11, ymm4 vpaddd ymm2, ymm2, ymm3 vpaddd ymm10, ymm10, ymm11 vpxor ymm1, ymm1, ymm2 vpxor ymm9, ymm9, ymm10 vpsrld ymm4, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm4 vpsrld ymm4, ymm9, 7 vpslld ymm9, ymm9, 25 vpor ymm9, ymm9, ymm4 vpshufd ymm0, ymm0, 0x39 vpshufd ymm8, ymm8, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm11, ymm11, 0x4E vpshufd ymm2, ymm2, 0x93 vpshufd ymm10, ymm10, 0x93 dec al je 9f vmovdqa ymm4, ymmword ptr [rsp+0x40] vmovdqa ymm5, ymmword ptr [rsp+0x80] vshufps ymm12, ymm4, ymm5, 214 vpshufd ymm13, ymm4, 0x0F vpshufd ymm4, ymm12, 0x39 vshufps ymm12, ymm6, ymm7, 250 vpblendd ymm13, ymm13, ymm12, 0xAA vpunpcklqdq ymm12, ymm7, ymm5 vpblendd ymm12, ymm12, ymm6, 0x88 vpshufd ymm12, ymm12, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymmword ptr [rsp+0x40], ymm13 vmovdqa ymmword ptr [rsp+0x80], ymm12 vmovdqa ymm12, ymmword ptr [rsp+0x60] vmovdqa ymm13, ymmword ptr [rsp+0xA0] vshufps ymm5, ymm12, ymm13, 214 vpshufd ymm6, ymm12, 0x0F vpshufd ymm12, ymm5, 0x39 vshufps ymm5, ymm14, ymm15, 250 vpblendd ymm6, ymm6, ymm5, 0xAA vpunpcklqdq ymm5, ymm15, ymm13 vpblendd ymm5, ymm5, ymm14, 0x88 vpshufd ymm5, ymm5, 0x78 vpunpckhdq ymm13, ymm13, ymm15 vpunpckldq ymm14, ymm14, ymm13 vpshufd ymm15, ymm14, 0x1E vmovdqa ymm13, ymm6 vmovdqa ymm14, ymm5 vmovdqa ymm5, ymmword ptr [rsp+0x40] vmovdqa ymm6, ymmword ptr [rsp+0x80] jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 vpxor ymm8, ymm8, ymm10 vpxor ymm9, ymm9, ymm11 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], 
ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqu xmmword ptr [rbx+0x40], xmm8 vmovdqu xmmword ptr [rbx+0x50], xmm9 vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 vmovaps xmm8, xmmword ptr [rsp+0x280] vmovaps xmm0, xmmword ptr [rsp+0x240] vmovaps xmm1, xmmword ptr [rsp+0x250] vmovaps xmm2, xmmword ptr [rsp+0x260] vmovaps xmm3, xmmword ptr [rsp+0x270] vblendvps xmm0, xmm0, xmm1, xmm8 vblendvps xmm2, xmm2, xmm3, xmm8 vmovaps xmmword ptr [rsp+0x240], xmm0 vmovaps xmmword ptr [rsp+0x260], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test rsi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp+0x240] vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x244] vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x200], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x200] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm14 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 12 vpslld ymm1, ymm1, 20 vpor ymm1, ymm1, ymm8 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxor ymm3, ymm3, ymm0 vpshufb ymm3, ymm3, ymm15 vpaddd ymm2, ymm2, ymm3 vpxor ymm1, ymm1, ymm2 vpsrld ymm8, ymm1, 7 vpslld ymm1, ymm1, 25 vpor ymm1, ymm1, ymm8 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword 
ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovaps ymm8, ymmword ptr [rsp+0x280] vmovaps ymm0, ymmword ptr [rsp+0x240] vmovups ymm1, ymmword ptr [rsp+0x248] vmovaps ymm2, ymmword ptr [rsp+0x260] vmovups ymm3, ymmword ptr [rsp+0x268] vblendvps ymm0, ymm0, ymm1, ymm8 vblendvps ymm2, ymm2, ymm3, ymm8 vmovaps ymmword ptr [rsp+0x240], ymm0 vmovaps ymmword ptr [rsp+0x260], ymm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test rsi, 0x1 je 4b vmovdqu xmm0, xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm3, dword ptr [rsp+0x240] vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm14, xmmword ptr [ROT16+rip] vmovdqa xmm15, xmmword ptr [ROT8+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] vmovdqa xmm3, xmm13 vpinsrd xmm3, xmm3, eax, 3 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm14 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 12 vpslld xmm1, xmm1, 20 vpor xmm1, xmm1, xmm8 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxor xmm3, xmm3, xmm0 vpshufb xmm3, xmm3, xmm15 vpaddd xmm2, xmm2, xmm3 vpxor xmm1, xmm1, xmm2 vpsrld xmm8, xmm1, 7 vpslld xmm1, xmm1, 25 vpor xmm1, xmm1, xmm8 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b -.size zfs_blake3_hash_many_avx2, . 
- zfs_blake3_hash_many_avx2 +SET_SIZE(zfs_blake3_hash_many_avx2) -#ifdef __APPLE__ -.static_data -#else +SECTION_STATIC .section .rodata -#endif .p2align 6 ADD0: .long 0, 1, 2, 3, 4, 5, 6, 7 ADD1: .long 8, 8, 8, 8, 8, 8, 8, 8 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A #endif /* HAVE_AVX2 */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S index 960406ea2c01..3b4cf94b657c 100644 --- a/module/icp/asm-x86_64/blake3/blake3_avx512.S +++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -1,2608 +1,2595 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves * Copyright (c) 2022 Tino Reichardt */ #if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) #define _ASM #include .intel_syntax noprefix -.global zfs_blake3_hash_many_avx512 -.global zfs_blake3_compress_in_place_avx512 -.global zfs_blake3_compress_xof_avx512 .text -.type zfs_blake3_hash_many_avx512,@function -.type zfs_blake3_compress_xof_avx512,@function -.type zfs_blake3_compress_in_place_avx512,@function - -.p2align 6 -zfs_blake3_hash_many_avx512: +ENTRY_ALIGN(zfs_blake3_hash_many_avx512, 64) ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 144 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9 kmovw k1, r9d vmovd xmm0, r8d vpbroadcastd ymm0, xmm0 shr r8, 32 vmovd xmm1, r8d vpbroadcastd ymm1, xmm1 vmovdqa ymm4, ymm1 vmovdqa ymm5, ymm1 vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] vpcmpltud k2, ymm2, ymm0 vpcmpltud k3, ymm3, ymm0 vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} knotw k2, k1 vmovdqa32 ymm2 {k2}, ymm0 vmovdqa32 ymm3 {k2}, ymm0 vmovdqa32 ymm4 {k2}, ymm1 vmovdqa32 ymm5 {k2}, ymm1 vmovdqa ymmword ptr [rsp], ymm2 vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 shl rdx, 6 mov qword ptr [rsp+0x80], rdx cmp rsi, 16 jc 3f 2: vpbroadcastd zmm0, dword ptr [rcx] vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx .p2align 5 9: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm8, zmm16, zmm17 vpunpckhqdq zmm9, zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm10, zmm18, zmm19 vpunpckhqdq zmm11, zmm18, zmm19 mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 vpunpcklqdq zmm12, zmm16, zmm17 vpunpckhqdq zmm13, 
zmm16, zmm17 vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 vpunpcklqdq zmm14, zmm18, zmm19 vpunpckhqdq zmm15, zmm18, zmm19 vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] vshufps zmm16, zmm8, zmm10, 136 vshufps zmm17, zmm12, zmm14, 136 vmovdqa32 zmm20, zmm16 vpermt2d zmm16, zmm27, zmm17 vpermt2d zmm20, zmm31, zmm17 vshufps zmm17, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm21, zmm17 vpermt2d zmm17, zmm27, zmm30 vpermt2d zmm21, zmm31, zmm30 vshufps zmm18, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm22, zmm18 vpermt2d zmm18, zmm27, zmm8 vpermt2d zmm22, zmm31, zmm8 vshufps zmm19, zmm9, zmm11, 221 vshufps zmm8, zmm13, zmm15, 221 vmovdqa32 zmm23, zmm19 vpermt2d zmm19, zmm27, zmm8 vpermt2d zmm23, zmm31, zmm8 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x40] mov r13, qword ptr [rdi+0x48] mov r14, qword ptr [rdi+0x50] mov r15, qword ptr [rdi+0x58] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm8, zmm24, zmm25 vpunpckhqdq zmm9, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm10, zmm24, zmm25 vpunpckhqdq zmm11, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] mov r8, qword ptr [rdi+0x20] mov r9, qword ptr [rdi+0x28] mov r10, qword ptr [rdi+0x30] mov r11, qword ptr [rdi+0x38] mov r12, qword ptr [rdi+0x60] mov r13, qword ptr [rdi+0x68] mov r14, qword ptr [rdi+0x70] mov r15, qword ptr [rdi+0x78] vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm12, zmm24, zmm25 vpunpckhqdq zmm13, zmm24, zmm25 vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 vpunpcklqdq zmm14, zmm24, zmm25 vpunpckhqdq zmm15, zmm24, zmm25 prefetcht0 [r8+rdx+0x80] prefetcht0 [r12+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r13+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r14+rdx+0x80] prefetcht0 [r11+rdx+0x80] prefetcht0 [r15+rdx+0x80] vshufps zmm24, zmm8, zmm10, 136 vshufps zmm30, zmm12, zmm14, 136 vmovdqa32 zmm28, zmm24 vpermt2d zmm24, zmm27, zmm30 vpermt2d zmm28, zmm31, zmm30 vshufps zmm25, zmm8, zmm10, 221 vshufps zmm30, zmm12, zmm14, 221 vmovdqa32 zmm29, zmm25 vpermt2d zmm25, zmm27, zmm30 vpermt2d zmm29, zmm31, zmm30 vshufps zmm26, zmm9, zmm11, 136 vshufps zmm8, zmm13, zmm15, 136 vmovdqa32 zmm30, zmm26 vpermt2d zmm26, zmm27, zmm8 vpermt2d zmm30, zmm31, zmm8 vshufps zmm8, zmm9, zmm11, 221 vshufps zmm10, zmm13, zmm15, 221 vpermi2d zmm27, zmm8, zmm10 vpermi2d zmm31, zmm8, zmm10 
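/*
 * The vpunpck/vshufps/vpermt2d sequence above (driven by the INDEX0 and
 * INDEX1 permutation tables) transposes the sixteen 64-byte input blocks so
 * that message word i of every block ends up in a single zmm register
 * (zmm16-zmm31); the rounds below then advance all sixteen hash lanes at
 * once. Unlike the AVX2 path, AVX-512 provides a native 32-bit rotate
 * (vprord), so the 16/12/8/7-bit rotations of the G function are single
 * instructions rather than shift/or pairs.
 */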
vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa32 zmm12, zmmword ptr [rsp] vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm24 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm23 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, 
zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm17 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm29 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord 
zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm22 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm27 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm21 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm30 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm20 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm21 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm16 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, 
zmm2, zmm24 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm28 vpaddd zmm1, zmm1, zmm25 vpaddd zmm2, zmm2, zmm31 vpaddd zmm3, zmm3, zmm30 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm26 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm23 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm16 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm18 vpaddd zmm1, zmm1, zmm19 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm25 vpaddd zmm1, zmm1, zmm27 vpaddd zmm2, zmm2, zmm24 vpaddd zmm3, zmm3, zmm31 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, 
zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm28 vpaddd zmm3, zmm3, zmm17 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm29 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm18 vpaddd zmm3, zmm3, zmm20 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm19 vpaddd zmm1, zmm1, zmm26 vpaddd zmm2, zmm2, zmm22 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpaddd zmm0, zmm0, zmm27 vpaddd zmm1, zmm1, zmm21 vpaddd zmm2, zmm2, zmm17 vpaddd zmm3, zmm3, zmm24 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vprord zmm15, zmm15, 16 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 12 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vpaddd zmm0, zmm0, zmm31 vpaddd zmm1, zmm1, zmm16 vpaddd zmm2, zmm2, zmm25 vpaddd zmm3, zmm3, zmm22 vpaddd zmm0, zmm0, zmm4 vpaddd zmm1, zmm1, zmm5 vpaddd zmm2, zmm2, zmm6 vpaddd zmm3, zmm3, zmm7 vpxord zmm12, zmm12, zmm0 vpxord zmm13, zmm13, zmm1 vpxord zmm14, zmm14, zmm2 vpxord zmm15, zmm15, zmm3 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 
vprord zmm14, zmm14, 8 vprord zmm15, zmm15, 8 vpaddd zmm8, zmm8, zmm12 vpaddd zmm9, zmm9, zmm13 vpaddd zmm10, zmm10, zmm14 vpaddd zmm11, zmm11, zmm15 vpxord zmm4, zmm4, zmm8 vpxord zmm5, zmm5, zmm9 vpxord zmm6, zmm6, zmm10 vpxord zmm7, zmm7, zmm11 vprord zmm4, zmm4, 7 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vpaddd zmm0, zmm0, zmm30 vpaddd zmm1, zmm1, zmm18 vpaddd zmm2, zmm2, zmm19 vpaddd zmm3, zmm3, zmm23 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 16 vprord zmm12, zmm12, 16 vprord zmm13, zmm13, 16 vprord zmm14, zmm14, 16 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 12 vprord zmm6, zmm6, 12 vprord zmm7, zmm7, 12 vprord zmm4, zmm4, 12 vpaddd zmm0, zmm0, zmm26 vpaddd zmm1, zmm1, zmm28 vpaddd zmm2, zmm2, zmm20 vpaddd zmm3, zmm3, zmm29 vpaddd zmm0, zmm0, zmm5 vpaddd zmm1, zmm1, zmm6 vpaddd zmm2, zmm2, zmm7 vpaddd zmm3, zmm3, zmm4 vpxord zmm15, zmm15, zmm0 vpxord zmm12, zmm12, zmm1 vpxord zmm13, zmm13, zmm2 vpxord zmm14, zmm14, zmm3 vprord zmm15, zmm15, 8 vprord zmm12, zmm12, 8 vprord zmm13, zmm13, 8 vprord zmm14, zmm14, 8 vpaddd zmm10, zmm10, zmm15 vpaddd zmm11, zmm11, zmm12 vpaddd zmm8, zmm8, zmm13 vpaddd zmm9, zmm9, zmm14 vpxord zmm5, zmm5, zmm10 vpxord zmm6, zmm6, zmm11 vpxord zmm7, zmm7, zmm8 vpxord zmm4, zmm4, zmm9 vprord zmm5, zmm5, 7 vprord zmm6, zmm6, 7 vprord zmm7, zmm7, 7 vprord zmm4, zmm4, 7 vpxord zmm0, zmm0, zmm8 vpxord zmm1, zmm1, zmm9 vpxord zmm2, zmm2, zmm10 vpxord zmm3, zmm3, zmm11 vpxord zmm4, zmm4, zmm12 vpxord zmm5, zmm5, zmm13 vpxord zmm6, zmm6, zmm14 vpxord zmm7, zmm7, zmm15 movzx eax, byte ptr [rbp+0x38] jne 9b mov rbx, qword ptr [rbp+0x50] vpunpckldq zmm16, zmm0, zmm1 vpunpckhdq zmm17, zmm0, zmm1 vpunpckldq zmm18, zmm2, zmm3 vpunpckhdq zmm19, zmm2, zmm3 vpunpckldq zmm20, zmm4, zmm5 vpunpckhdq zmm21, zmm4, zmm5 vpunpckldq zmm22, zmm6, zmm7 vpunpckhdq zmm23, zmm6, zmm7 vpunpcklqdq zmm0, zmm16, zmm18 vpunpckhqdq zmm1, zmm16, zmm18 vpunpcklqdq zmm2, zmm17, zmm19 vpunpckhqdq zmm3, zmm17, zmm19 vpunpcklqdq zmm4, zmm20, zmm22 vpunpckhqdq zmm5, zmm20, zmm22 vpunpcklqdq zmm6, zmm21, zmm23 vpunpckhqdq zmm7, zmm21, zmm23 vshufi32x4 zmm16, zmm0, zmm4, 0x88 vshufi32x4 zmm17, zmm1, zmm5, 0x88 vshufi32x4 zmm18, zmm2, zmm6, 0x88 vshufi32x4 zmm19, zmm3, zmm7, 0x88 vshufi32x4 zmm20, zmm0, zmm4, 0xDD vshufi32x4 zmm21, zmm1, zmm5, 0xDD vshufi32x4 zmm22, zmm2, zmm6, 0xDD vshufi32x4 zmm23, zmm3, zmm7, 0xDD vshufi32x4 zmm0, zmm16, zmm17, 0x88 vshufi32x4 zmm1, zmm18, zmm19, 0x88 vshufi32x4 zmm2, zmm20, zmm21, 0x88 vshufi32x4 zmm3, zmm22, zmm23, 0x88 vshufi32x4 zmm4, zmm16, zmm17, 0xDD vshufi32x4 zmm5, zmm18, zmm19, 0xDD vshufi32x4 zmm6, zmm20, zmm21, 0xDD vshufi32x4 zmm7, zmm22, zmm23, 0xDD vmovdqu32 zmmword ptr [rbx], zmm0 vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 vmovdqa32 zmm0, zmmword ptr [rsp] vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] vmovdqa32 zmm2, zmm0 vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} vpcmpltud k2, zmm2, zmm0 vpaddd zmm1 {k2}, zmm1, dword ptr 
[ADD1+rip] {1to16} vmovdqa32 zmmword ptr [rsp], zmm2 vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 add rdi, 128 add rbx, 512 mov qword ptr [rbp+0x50], rbx sub rsi, 16 cmp rsi, 16 jnc 2b test rsi, rsi jnz 3f 4: vzeroupper mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 RET .p2align 6 3: test esi, 0x8 je 3f vpbroadcastd ymm0, dword ptr [rcx] vpbroadcastd ymm1, dword ptr [rcx+0x4] vpbroadcastd ymm2, dword ptr [rcx+0x8] vpbroadcastd ymm3, dword ptr [rcx+0xC] vpbroadcastd ymm4, dword ptr [rcx+0x10] vpbroadcastd ymm5, dword ptr [rcx+0x14] vpbroadcastd ymm6, dword ptr [rcx+0x18] vpbroadcastd ymm7, dword ptr [rcx+0x1C] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov r12, qword ptr [rdi+0x20] mov r13, qword ptr [rdi+0x28] mov r14, qword ptr [rdi+0x30] mov r15, qword ptr [rdi+0x38] movzx eax, byte ptr [rbp+0x38] movzx ebx, byte ptr [rbp+0x40] or eax, ebx xor edx, edx 2: movzx ebx, byte ptr [rbp+0x48] or ebx, eax add rdx, 64 cmp rdx, qword ptr [rsp+0x80] cmove eax, ebx mov dword ptr [rsp+0x88], eax vmovups xmm8, xmmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x40] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x40] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x40] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm16, ymm12, ymm14, 136 vshufps ymm17, ymm12, ymm14, 221 vshufps ymm18, ymm13, ymm15, 136 vshufps ymm19, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x30] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x30] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x30] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm20, ymm12, ymm14, 136 vshufps ymm21, ymm12, ymm14, 221 vshufps ymm22, ymm13, ymm15, 136 vshufps ymm23, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x20] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x20] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x20] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm24, ymm12, ymm14, 136 vshufps ymm25, ymm12, ymm14, 221 vshufps ymm26, ymm13, ymm15, 136 vshufps ymm27, ymm13, ymm15, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x10] vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 vmovups xmm9, xmmword ptr [r9+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 vunpcklpd ymm12, ymm8, ymm9 vunpckhpd ymm13, ymm8, ymm9 vmovups xmm10, xmmword ptr [r10+rdx-0x10] vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 vmovups xmm11, xmmword ptr [r11+rdx-0x10] vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 vunpcklpd ymm14, ymm10, ymm11 vunpckhpd ymm15, ymm10, ymm11 vshufps ymm28, ymm12, ymm14, 
136 vshufps ymm29, ymm12, ymm14, 221 vshufps ymm30, ymm13, ymm15, 136 vshufps ymm31, ymm13, ymm15, 221 vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] vmovdqa ymm12, ymmword ptr [rsp] vmovdqa ymm13, ymmword ptr [rsp+0x40] vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] vpbroadcastd ymm15, dword ptr [rsp+0x88] vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm24 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm23 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 
16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm17 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm29 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, 
ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm22 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm27 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm21 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm30 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm20 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm21 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, 
ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm16 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm28 vpaddd ymm1, ymm1, ymm25 vpaddd ymm2, ymm2, ymm31 vpaddd ymm3, ymm3, ymm30 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm26 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm23 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm16 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm18 vpaddd ymm1, ymm1, ymm19 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm25 vpaddd ymm1, ymm1, ymm27 vpaddd ymm2, ymm2, ymm24 vpaddd ymm3, ymm3, ymm31 vpaddd ymm0, 
ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm28 vpaddd ymm3, ymm3, ymm17 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm29 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm18 vpaddd ymm3, ymm3, ymm20 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm19 vpaddd ymm1, ymm1, ymm26 vpaddd ymm2, ymm2, ymm22 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpaddd ymm0, ymm0, ymm27 vpaddd ymm1, ymm1, ymm21 vpaddd ymm2, ymm2, ymm17 vpaddd ymm3, ymm3, ymm24 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vprord ymm15, ymm15, 16 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 12 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vpaddd ymm0, ymm0, ymm31 vpaddd ymm1, ymm1, ymm16 vpaddd ymm2, ymm2, ymm25 vpaddd ymm3, ymm3, ymm22 vpaddd ymm0, ymm0, ymm4 vpaddd ymm1, ymm1, ymm5 vpaddd ymm2, ymm2, ymm6 vpaddd ymm3, ymm3, ymm7 vpxord ymm12, ymm12, ymm0 vpxord ymm13, ymm13, ymm1 
vpxord ymm14, ymm14, ymm2 vpxord ymm15, ymm15, ymm3 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vprord ymm15, ymm15, 8 vpaddd ymm8, ymm8, ymm12 vpaddd ymm9, ymm9, ymm13 vpaddd ymm10, ymm10, ymm14 vpaddd ymm11, ymm11, ymm15 vpxord ymm4, ymm4, ymm8 vpxord ymm5, ymm5, ymm9 vpxord ymm6, ymm6, ymm10 vpxord ymm7, ymm7, ymm11 vprord ymm4, ymm4, 7 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vpaddd ymm0, ymm0, ymm30 vpaddd ymm1, ymm1, ymm18 vpaddd ymm2, ymm2, ymm19 vpaddd ymm3, ymm3, ymm23 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 16 vprord ymm12, ymm12, 16 vprord ymm13, ymm13, 16 vprord ymm14, ymm14, 16 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 12 vprord ymm6, ymm6, 12 vprord ymm7, ymm7, 12 vprord ymm4, ymm4, 12 vpaddd ymm0, ymm0, ymm26 vpaddd ymm1, ymm1, ymm28 vpaddd ymm2, ymm2, ymm20 vpaddd ymm3, ymm3, ymm29 vpaddd ymm0, ymm0, ymm5 vpaddd ymm1, ymm1, ymm6 vpaddd ymm2, ymm2, ymm7 vpaddd ymm3, ymm3, ymm4 vpxord ymm15, ymm15, ymm0 vpxord ymm12, ymm12, ymm1 vpxord ymm13, ymm13, ymm2 vpxord ymm14, ymm14, ymm3 vprord ymm15, ymm15, 8 vprord ymm12, ymm12, 8 vprord ymm13, ymm13, 8 vprord ymm14, ymm14, 8 vpaddd ymm10, ymm10, ymm15 vpaddd ymm11, ymm11, ymm12 vpaddd ymm8, ymm8, ymm13 vpaddd ymm9, ymm9, ymm14 vpxord ymm5, ymm5, ymm10 vpxord ymm6, ymm6, ymm11 vpxord ymm7, ymm7, ymm8 vpxord ymm4, ymm4, ymm9 vprord ymm5, ymm5, 7 vprord ymm6, ymm6, 7 vprord ymm7, ymm7, 7 vprord ymm4, ymm4, 7 vpxor ymm0, ymm0, ymm8 vpxor ymm1, ymm1, ymm9 vpxor ymm2, ymm2, ymm10 vpxor ymm3, ymm3, ymm11 vpxor ymm4, ymm4, ymm12 vpxor ymm5, ymm5, ymm13 vpxor ymm6, ymm6, ymm14 vpxor ymm7, ymm7, ymm15 movzx eax, byte ptr [rbp+0x38] jne 2b mov rbx, qword ptr [rbp+0x50] vunpcklps ymm8, ymm0, ymm1 vunpcklps ymm9, ymm2, ymm3 vunpckhps ymm10, ymm0, ymm1 vunpcklps ymm11, ymm4, ymm5 vunpcklps ymm0, ymm6, ymm7 vshufps ymm12, ymm8, ymm9, 78 vblendps ymm1, ymm8, ymm12, 0xCC vshufps ymm8, ymm11, ymm0, 78 vunpckhps ymm13, ymm2, ymm3 vblendps ymm2, ymm11, ymm8, 0xCC vblendps ymm3, ymm12, ymm9, 0xCC vperm2f128 ymm12, ymm1, ymm2, 0x20 vmovups ymmword ptr [rbx], ymm12 vunpckhps ymm14, ymm4, ymm5 vblendps ymm4, ymm8, ymm0, 0xCC vunpckhps ymm15, ymm6, ymm7 vperm2f128 ymm7, ymm3, ymm4, 0x20 vmovups ymmword ptr [rbx+0x20], ymm7 vshufps ymm5, ymm10, ymm13, 78 vblendps ymm6, ymm5, ymm13, 0xCC vshufps ymm13, ymm14, ymm15, 78 vblendps ymm10, ymm10, ymm5, 0xCC vblendps ymm14, ymm14, ymm13, 0xCC vperm2f128 ymm8, ymm10, ymm14, 0x20 vmovups ymmword ptr [rbx+0x40], ymm8 vblendps ymm15, ymm13, ymm15, 0xCC vperm2f128 ymm13, ymm6, ymm15, 0x20 vmovups ymmword ptr [rbx+0x60], ymm13 vperm2f128 ymm9, ymm1, ymm2, 0x31 vperm2f128 ymm11, ymm3, ymm4, 0x31 vmovups ymmword ptr [rbx+0x80], ymm9 vperm2f128 ymm14, ymm10, ymm14, 0x31 vperm2f128 ymm15, ymm6, ymm15, 0x31 vmovups ymmword ptr [rbx+0xA0], ymm11 vmovups ymmword ptr [rbx+0xC0], ymm14 vmovups ymmword ptr [rbx+0xE0], ymm15 vmovdqa ymm0, ymmword ptr [rsp] vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] vmovdqa ymmword ptr [rsp], ymm0 vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 add rbx, 256 mov qword ptr [rbp+0x50], rbx add rdi, 64 sub rsi, 8 3: 
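/*
 * Illustrative note (the wording and the C names below are not from the
 * original source): after the 16-way main loop and the 8-way tail above,
 * the code from this label on hashes the remaining inputs 4 at a time
 * (one message block per 128-bit lane of a zmm register), then 2 at a
 * time in a ymm register, and finally one at a time in xmm registers,
 * choosing each width from the low bits of the remaining input count in
 * rsi.  Roughly, as a sketch:
 *
 *	while (num_inputs >= 16) { hash16(...); num_inputs -= 16; }
 *	if (num_inputs & 8) hash8(...);
 *	if (num_inputs & 4) hash4(...);
 *	if (num_inputs & 2) hash2(...);
 *	if (num_inputs & 1) hash1(...);
 */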
mov rbx, qword ptr [rbp+0x50] mov r15, qword ptr [rsp+0x80] movzx r13, byte ptr [rbp+0x38] movzx r12, byte ptr [rbp+0x48] test esi, 0x4 je 3f vbroadcasti32x4 zmm0, xmmword ptr [rcx] vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] vmovdqa xmm12, xmmword ptr [rsp] vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] vpunpckldq xmm14, xmm12, xmm13 vpunpckhdq xmm15, xmm12, xmm13 vpermq ymm14, ymm14, 0xDC vpermq ymm15, ymm15, 0xDC vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] vinserti32x8 zmm13, zmm14, ymm15, 0x01 mov eax, 17476 kmovw k2, eax vpblendmd zmm13 {k2}, zmm13, zmm12 vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] mov eax, 43690 kmovw k3, eax mov eax, 34952 kmovw k4, eax movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vmovdqa32 zmm2, zmm15 vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] vpblendmd zmm3 {k4}, zmm13, zmm8 vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x30] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 vshufps zmm4, zmm8, zmm9, 136 vshufps zmm5, zmm8, zmm9, 221 vmovups zmm8, zmmword ptr [r8+rdx-0x20] vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 vmovups zmm9, zmmword ptr [r8+rdx-0x10] vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 vshufps zmm6, zmm8, zmm9, 136 vshufps zmm7, zmm8, zmm9, 221 vpshufd zmm6, zmm6, 0x93 vpshufd zmm7, zmm7, 0x93 mov al, 7 9: vpaddd zmm0, zmm0, zmm4 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm5 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x93 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x39 vpaddd zmm0, zmm0, zmm6 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 16 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 12 vpaddd zmm0, zmm0, zmm7 vpaddd zmm0, zmm0, zmm1 vpxord zmm3, zmm3, zmm0 vprord zmm3, zmm3, 8 vpaddd zmm2, zmm2, zmm3 vpxord zmm1, zmm1, zmm2 vprord zmm1, zmm1, 7 vpshufd zmm0, zmm0, 0x39 vpshufd zmm3, zmm3, 0x4E vpshufd zmm2, zmm2, 0x93 dec al jz 9f vshufps zmm8, zmm4, zmm5, 214 vpshufd zmm9, zmm4, 0x0F vpshufd zmm4, zmm8, 0x39 vshufps zmm8, zmm6, zmm7, 250 vpblendmd zmm9 {k3}, zmm9, zmm8 vpunpcklqdq zmm8, zmm7, zmm5 vpblendmd zmm8 {k4}, zmm8, zmm6 vpshufd zmm8, zmm8, 0x78 vpunpckhdq zmm5, zmm5, zmm7 vpunpckldq zmm6, zmm6, zmm5 vpshufd zmm7, zmm6, 0x1E vmovdqa32 zmm5, zmm9 vmovdqa32 zmm6, zmm8 jmp 9b 9: vpxord zmm0, zmm0, zmm2 vpxord zmm1, zmm1, zmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr 
[rbx+0x30], ymm1, 0x01 vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x40] vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x40], xmm2 add rbx, 128 add rdi, 32 sub rsi, 4 3: test esi, 0x2 je 3f vbroadcasti128 ymm0, xmmword ptr [rcx] vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] vmovd xmm13, dword ptr [rsp] vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovd xmm14, dword ptr [rsp+0x4] vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vinserti128 ymm13, ymm13, xmm14, 0x01 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d mov dword ptr [rsp+0x88], eax vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] vpbroadcastd ymm8, dword ptr [rsp+0x88] vpblendd ymm3, ymm13, ymm8, 0x88 vmovups ymm8, ymmword ptr [r8+rdx-0x40] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x30] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 vshufps ymm4, ymm8, ymm9, 136 vshufps ymm5, ymm8, ymm9, 221 vmovups ymm8, ymmword ptr [r8+rdx-0x20] vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 vmovups ymm9, ymmword ptr [r8+rdx-0x10] vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 vshufps ymm6, ymm8, ymm9, 136 vshufps ymm7, ymm8, ymm9, 221 vpshufd ymm6, ymm6, 0x93 vpshufd ymm7, ymm7, 0x93 mov al, 7 9: vpaddd ymm0, ymm0, ymm4 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm5 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x93 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x39 vpaddd ymm0, ymm0, ymm6 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 16 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 12 vpaddd ymm0, ymm0, ymm7 vpaddd ymm0, ymm0, ymm1 vpxord ymm3, ymm3, ymm0 vprord ymm3, ymm3, 8 vpaddd ymm2, ymm2, ymm3 vpxord ymm1, ymm1, ymm2 vprord ymm1, ymm1, 7 vpshufd ymm0, ymm0, 0x39 vpshufd ymm3, ymm3, 0x4E vpshufd ymm2, ymm2, 0x93 dec al jz 9f vshufps ymm8, ymm4, ymm5, 214 vpshufd ymm9, ymm4, 0x0F vpshufd ymm4, ymm8, 0x39 vshufps ymm8, ymm6, ymm7, 250 vpblendd ymm9, ymm9, ymm8, 0xAA vpunpcklqdq ymm8, ymm7, ymm5 vpblendd ymm8, ymm8, ymm6, 0x88 vpshufd ymm8, ymm8, 0x78 vpunpckhdq ymm5, ymm5, ymm7 vpunpckldq ymm6, ymm6, ymm5 vpshufd ymm7, ymm6, 0x1E vmovdqa ymm5, ymm9 vmovdqa ymm6, ymm8 jmp 9b 9: vpxor ymm0, ymm0, ymm2 vpxor ymm1, ymm1, ymm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 vmovdqa xmm0, xmmword ptr [rsp] vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] vmovdqa xmmword ptr [rsp], xmm0 vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 add rbx, 64 add rdi, 16 sub rsi, 2 3: test esi, 0x1 je 4b vmovdqu xmm0, 
xmmword ptr [rcx] vmovdqu xmm1, xmmword ptr [rcx+0x10] vmovd xmm14, dword ptr [rsp] vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx .p2align 5 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d vpinsrd xmm3, xmm14, eax, 3 vmovdqa xmm2, xmm15 vmovups xmm8, xmmword ptr [r8+rdx-0x40] vmovups xmm9, xmmword ptr [r8+rdx-0x30] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [r8+rdx-0x20] vmovups xmm9, xmmword ptr [r8+rdx-0x10] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b vmovdqu xmmword ptr [rbx], xmm0 vmovdqu xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_avx512: + +ENTRY_ALIGN(zfs_blake3_compress_in_place_avx512, 64) ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax vmovq xmm3, rcx vmovq xmm4, rdx vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rsi] vmovups xmm9, xmmword ptr [rsi+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rsi+0x20] vmovups xmm9, xmmword ptr [rsi+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E 
vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vmovdqu xmmword ptr [rdi], xmm0 vmovdqu xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_avx512: +ENTRY_ALIGN(zfs_blake3_compress_xof_avx512, 64) ENDBR vmovdqu xmm0, xmmword ptr [rdi] vmovdqu xmm1, xmmword ptr [rdi+0x10] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax vmovq xmm3, rcx vmovq xmm4, rdx vpunpcklqdq xmm3, xmm3, xmm4 vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] vmovups xmm8, xmmword ptr [rsi] vmovups xmm9, xmmword ptr [rsi+0x10] vshufps xmm4, xmm8, xmm9, 136 vshufps xmm5, xmm8, xmm9, 221 vmovups xmm8, xmmword ptr [rsi+0x20] vmovups xmm9, xmmword ptr [rsi+0x30] vshufps xmm6, xmm8, xmm9, 136 vshufps xmm7, xmm8, xmm9, 221 vpshufd xmm6, xmm6, 0x93 vpshufd xmm7, xmm7, 0x93 mov al, 7 9: vpaddd xmm0, xmm0, xmm4 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm5 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x93 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x39 vpaddd xmm0, xmm0, xmm6 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 16 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 12 vpaddd xmm0, xmm0, xmm7 vpaddd xmm0, xmm0, xmm1 vpxord xmm3, xmm3, xmm0 vprord xmm3, xmm3, 8 vpaddd xmm2, xmm2, xmm3 vpxord xmm1, xmm1, xmm2 vprord xmm1, xmm1, 7 vpshufd xmm0, xmm0, 0x39 vpshufd xmm3, xmm3, 0x4E vpshufd xmm2, xmm2, 0x93 dec al jz 9f vshufps xmm8, xmm4, xmm5, 214 vpshufd xmm9, xmm4, 0x0F vpshufd xmm4, xmm8, 0x39 vshufps xmm8, xmm6, xmm7, 250 vpblendd xmm9, xmm9, xmm8, 0xAA vpunpcklqdq xmm8, xmm7, xmm5 vpblendd xmm8, xmm8, xmm6, 0x88 vpshufd xmm8, xmm8, 0x78 vpunpckhdq xmm5, xmm5, xmm7 vpunpckldq xmm6, xmm6, xmm5 vpshufd xmm7, xmm6, 0x1E vmovdqa xmm5, xmm9 vmovdqa xmm6, xmm8 jmp 9b 9: vpxor xmm0, xmm0, xmm2 vpxor xmm1, xmm1, xmm3 vpxor xmm2, xmm2, [rdi] vpxor xmm3, xmm3, [rdi+0x10] vmovdqu xmmword ptr [r9], xmm0 vmovdqu xmmword ptr [r9+0x10], xmm1 vmovdqu xmmword ptr [r9+0x20], xmm2 vmovdqu xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512 -.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512 -.size zfs_blake3_compress_xof_avx512, . 
- zfs_blake3_compress_xof_avx512
+SET_SIZE(zfs_blake3_hash_many_avx512)
+SET_SIZE(zfs_blake3_compress_in_place_avx512)
+SET_SIZE(zfs_blake3_compress_xof_avx512)
-#ifdef __APPLE__
-.static_data
-#else
-.section .rodata
-#endif
+SECTION_STATIC

.p2align 6
INDEX0: .long 0, 1, 2, 3, 16, 17, 18, 19
	.long 8, 9, 10, 11, 24, 25, 26, 27
INDEX1: .long 4, 5, 6, 7, 20, 21, 22, 23
	.long 12, 13, 14, 15, 28, 29, 30, 31
ADD0:	.long 0, 1, 2, 3, 4, 5, 6, 7
	.long 8, 9, 10, 11, 12, 13, 14, 15
ADD1:	.long 1
ADD16:	.long 16
BLAKE3_BLOCK_LEN:
	.long 64
.p2align 6
BLAKE3_IV:
BLAKE3_IV_0:	.long 0x6A09E667
BLAKE3_IV_1:	.long 0xBB67AE85
BLAKE3_IV_2:	.long 0x3C6EF372
BLAKE3_IV_3:	.long 0xA54FF53A

#endif /* HAVE_AVX512 */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif

diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S
index c4290aaa8faf..4a07699f12cb 100644
--- a/module/icp/asm-x86_64/blake3/blake3_sse2.S
+++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S
@@ -1,2313 +1,2300 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
 * Copyright (c) 2022 Tino Reichardt
 */

#if defined(HAVE_SSE2)

#define _ASM
#include <sys/asm_linkage.h>

.intel_syntax noprefix

-.global zfs_blake3_hash_many_sse2
-.global zfs_blake3_compress_in_place_sse2
-.global zfs_blake3_compress_xof_sse2
-.text
-.type zfs_blake3_hash_many_sse2,@function
-.type zfs_blake3_compress_in_place_sse2,@function
-.type zfs_blake3_compress_xof_sse2,@function
+SECTION_TEXT

- .p2align 6
-zfs_blake3_hash_many_sse2:
+ENTRY_ALIGN(zfs_blake3_hash_many_sse2, 64)
ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18]
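/*
 * Illustrative note (the C sketch below is not part of this file): the
 * round code that follows is the 4-way SSE2 rendering of the BLAKE3 G
 * function, which mixes one column or diagonal of the 4x4 state with two
 * message words.  In scalar C terms (identifier names are illustrative):
 *
 *	static inline uint32_t ror32(uint32_t x, int r)
 *	{
 *		return ((x >> r) | (x << (32 - r)));
 *	}
 *
 *	static void g(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
 *	    uint32_t mx, uint32_t my)
 *	{
 *		*a += *b + mx; *d = ror32(*d ^ *a, 16);
 *		*c += *d;      *b = ror32(*b ^ *c, 12);
 *		*a += *b + my; *d = ror32(*d ^ *a, 8);
 *		*c += *d;      *b = ror32(*b ^ *c, 7);
 *	}
 *
 * SSE2 has no vector rotate instruction, so the rotate by 16 is done with
 * a pshuflw/pshufhw 0xB1 pair and the rotates by 12, 8, and 7 with
 * psrld/pslld plus por (or pxor), which is the pattern visible in the
 * rounds below; the AVX-512 paths earlier in this patch use vprord for
 * the same four rotate amounts.
 */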
movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, 
xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, 
xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] 
paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor 
xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 
pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, 
xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, 
xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd 
xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 pshuflw xmm15, xmm15, 0xB1 pshufhw xmm15, xmm15, 0xB1 pshuflw xmm12, xmm12, 0xB1 pshufhw xmm12, xmm12, 0xB1 pshuflw xmm13, xmm13, 0xB1 pshufhw xmm13, xmm13, 0xB1 pshuflw xmm14, xmm14, 0xB1 pshufhw xmm14, xmm14, 0xB1 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmm15 psrld xmm15, 8 pslld xmm8, 24 pxor xmm15, xmm8 movdqa xmm8, xmm12 psrld xmm12, 8 pslld xmm8, 24 pxor xmm12, xmm8 movdqa xmm8, xmm13 psrld xmm13, 8 pslld xmm8, 24 pxor xmm13, xmm8 movdqa xmm8, xmm14 psrld xmm14, 8 pslld xmm8, 24 pxor xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 RET .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] movd xmm13, dword ptr [rsp+0x124] punpckldq xmm14, xmm13 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] 
mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 shl rax, 0x20 or rax, 0x40 movq xmm3, rax movdqa xmmword ptr [rsp+0x20], xmm3 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] punpcklqdq xmm3, xmmword ptr [rsp+0x20] punpcklqdq xmm11, xmmword ptr [rsp+0x20] mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 pshuflw xmm11, xmm11, 0xB1 pshufhw xmm11, xmm11, 0xB1 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movdqa xmm13, xmm3 psrld xmm3, 8 pslld xmm13, 24 pxor xmm3, xmm13 movdqa xmm13, xmm11 psrld xmm11, 8 pslld xmm13, 24 pxor xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm12, xmmword 
ptr [PBLENDW_0xCC_MASK+rip] por xmm13, xmm12 movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 movdqa xmm13, xmm6 pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm12, xmm13 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm6, xmm5 movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 movdqa xmmword ptr [rsp+0x30], xmm2 movdqa xmm2, xmm14 pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm5, xmm2 movdqa xmm2, xmmword ptr [rsp+0x30] pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 mov eax, dword ptr [rsp+0x130] neg eax mov r10d, dword ptr [rsp+0x110+8*rax] mov r11d, dword ptr [rsp+0x120+8*rax] mov dword ptr [rsp+0x110], r10d mov dword ptr [rsp+0x120], r11d add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] movd xmm14, dword ptr [rsp+0x120] punpckldq xmm13, xmm14 mov r8, qword ptr [rdi] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl rax, 32 or rax, 64 movq xmm12, rax movdqa xmm3, xmm13 punpcklqdq xmm3, xmm12 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr 
[PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_sse2: +ENTRY_ALIGN(zfs_blake3_compress_in_place_sse2, 64) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl r8, 32 add rdx, r8 movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_sse2: +ENTRY_ALIGN(zfs_blake3_compress_xof_sse2, 64) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 
paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshuflw xmm3, xmm3, 0xB1 pshufhw xmm3, xmm3, 0xB1 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 movdqa xmm14, xmm3 psrld xmm3, 8 pslld xmm14, 24 pxor xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] por xmm9, xmm8 movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 movdqa xmm10, xmm6 pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] por xmm8, xmm10 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 -.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 -.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2 +SET_SIZE(zfs_blake3_hash_many_sse2) +SET_SIZE(zfs_blake3_compress_in_place_sse2) +SET_SIZE(zfs_blake3_compress_xof_sse2) -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif +SECTION_STATIC .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A -ADD0: +ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 PBLENDW_0x33_MASK: .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 PBLENDW_0xCC_MASK: .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF PBLENDW_0x3F_MASK: .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 PBLENDW_0xC0_MASK: .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF #endif /* HAVE_SSE2 */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S index 45b90cc9ed89..0b1c8b6a6251 100644 --- a/module/icp/asm-x86_64/blake3/blake3_sse41.S +++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -1,2048 +1,2038 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
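(Likewise, the ENDBR marker emitted at each entry point and the RET macro used in place of a bare ret come from sys/asm_linkage.h. A plausible minimal definition, assuming an ELF x86-64 target and a CET-aware toolchain, is sketched below; the real definitions may differ.)

/* Illustrative sketch only. */
#if defined(__ELF__) && defined(__CET__) && ((__CET__) & 0x1)
#define ENDBR	endbr64		/* indirect-branch landing pad for Intel IBT */
#else
#define ENDBR			/* no IBT support: expands to nothing */
#endif

#define RET	ret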
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2020 Samuel Neves * Copyright (c) 2022 Tino Reichardt */ #if defined(HAVE_SSE4_1) #define _ASM #include .intel_syntax noprefix -.global zfs_blake3_compress_in_place_sse41 -.global zfs_blake3_compress_xof_sse41 -.global zfs_blake3_hash_many_sse41 .text -.type zfs_blake3_hash_many_sse41,@function -.type zfs_blake3_compress_in_place_sse41,@function -.type zfs_blake3_compress_xof_sse41,@function -.p2align 6 -zfs_blake3_hash_many_sse41: +ENTRY_ALIGN(zfs_blake3_hash_many_sse41, 64) ENDBR push r15 push r14 push r13 push r12 push rbx push rbp mov rbp, rsp sub rsp, 360 and rsp, 0xFFFFFFFFFFFFFFC0 neg r9d movd xmm0, r9d pshufd xmm0, xmm0, 0x00 movdqa xmmword ptr [rsp+0x130], xmm0 movdqa xmm1, xmm0 pand xmm1, xmmword ptr [ADD0+rip] pand xmm0, xmmword ptr [ADD1+rip] movdqa xmmword ptr [rsp+0x150], xmm0 movd xmm0, r8d pshufd xmm0, xmm0, 0x00 paddd xmm0, xmm1 movdqa xmmword ptr [rsp+0x110], xmm0 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm1, xmm0 shr r8, 32 movd xmm2, r8d pshufd xmm2, xmm2, 0x00 psubd xmm2, xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 mov rbx, qword ptr [rbp+0x50] mov r15, rdx shl r15, 6 movzx r13d, byte ptr [rbp+0x38] movzx r12d, byte ptr [rbp+0x48] cmp rsi, 4 jc 3f 2: movdqu xmm3, xmmword ptr [rcx] pshufd xmm0, xmm3, 0x00 pshufd xmm1, xmm3, 0x55 pshufd xmm2, xmm3, 0xAA pshufd xmm3, xmm3, 0xFF movdqu xmm7, xmmword ptr [rcx+0x10] pshufd xmm4, xmm7, 0x00 pshufd xmm5, xmm7, 0x55 pshufd xmm6, xmm7, 0xAA pshufd xmm7, xmm7, 0xFF mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] mov r10, qword ptr [rdi+0x10] mov r11, qword ptr [rdi+0x18] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 9: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movdqu xmm8, xmmword ptr [r8+rdx-0x40] movdqu xmm9, xmmword ptr [r9+rdx-0x40] movdqu xmm10, xmmword ptr [r10+rdx-0x40] movdqu xmm11, xmmword ptr [r11+rdx-0x40] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp], xmm8 movdqa xmmword ptr [rsp+0x10], xmm9 movdqa xmmword ptr [rsp+0x20], xmm12 movdqa xmmword ptr [rsp+0x30], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x30] movdqu xmm9, xmmword ptr [r9+rdx-0x30] movdqu xmm10, xmmword ptr [r10+rdx-0x30] movdqu xmm11, xmmword ptr [r11+rdx-0x30] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x40], xmm8 movdqa xmmword ptr [rsp+0x50], xmm9 movdqa xmmword ptr [rsp+0x60], xmm12 movdqa xmmword ptr [rsp+0x70], xmm13 movdqu xmm8, 
xmmword ptr [r8+rdx-0x20] movdqu xmm9, xmmword ptr [r9+rdx-0x20] movdqu xmm10, xmmword ptr [r10+rdx-0x20] movdqu xmm11, xmmword ptr [r11+rdx-0x20] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0x80], xmm8 movdqa xmmword ptr [rsp+0x90], xmm9 movdqa xmmword ptr [rsp+0xA0], xmm12 movdqa xmmword ptr [rsp+0xB0], xmm13 movdqu xmm8, xmmword ptr [r8+rdx-0x10] movdqu xmm9, xmmword ptr [r9+rdx-0x10] movdqu xmm10, xmmword ptr [r10+rdx-0x10] movdqu xmm11, xmmword ptr [r11+rdx-0x10] movdqa xmm12, xmm8 punpckldq xmm8, xmm9 punpckhdq xmm12, xmm9 movdqa xmm14, xmm10 punpckldq xmm10, xmm11 punpckhdq xmm14, xmm11 movdqa xmm9, xmm8 punpcklqdq xmm8, xmm10 punpckhqdq xmm9, xmm10 movdqa xmm13, xmm12 punpcklqdq xmm12, xmm14 punpckhqdq xmm13, xmm14 movdqa xmmword ptr [rsp+0xC0], xmm8 movdqa xmmword ptr [rsp+0xD0], xmm9 movdqa xmmword ptr [rsp+0xE0], xmm12 movdqa xmmword ptr [rsp+0xF0], xmm13 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] movdqa xmm12, xmmword ptr [rsp+0x110] movdqa xmm13, xmmword ptr [rsp+0x120] movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] movd xmm15, eax pshufd xmm15, xmm15, 0x00 prefetcht0 [r8+rdx+0x80] prefetcht0 [r9+rdx+0x80] prefetcht0 [r10+rdx+0x80] prefetcht0 [r11+rdx+0x80] paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x80] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, 
xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr [rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x70] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x10] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, 
xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0xD0] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x60] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xB0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 
psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x50] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0xE0] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x40] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x50] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld 
xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xC0] paddd xmm1, xmmword ptr [rsp+0x90] paddd xmm2, xmmword ptr [rsp+0xF0] paddd xmm3, xmmword ptr [rsp+0xE0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0xA0] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0x70] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x20] paddd xmm1, xmmword ptr 
[rsp+0x30] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x90] paddd xmm1, xmmword ptr [rsp+0xB0] paddd xmm2, xmmword ptr [rsp+0x80] paddd xmm3, xmmword ptr [rsp+0xF0] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0xC0] paddd xmm3, xmmword ptr [rsp+0x10] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xD0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x20] paddd xmm3, xmmword ptr [rsp+0x40] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0x30] paddd xmm1, xmmword ptr [rsp+0xA0] paddd xmm2, xmmword ptr [rsp+0x60] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 
pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xB0] paddd xmm1, xmmword ptr [rsp+0x50] paddd xmm2, xmmword ptr [rsp+0x10] paddd xmm3, xmmword ptr [rsp+0x80] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xF0] paddd xmm1, xmmword ptr [rsp] paddd xmm2, xmmword ptr [rsp+0x90] paddd xmm3, xmmword ptr [rsp+0x60] paddd xmm0, xmm4 paddd xmm1, xmm5 paddd xmm2, xmm6 paddd xmm3, xmm7 pxor xmm12, xmm0 pxor xmm13, xmm1 pxor xmm14, xmm2 pxor xmm15, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 pshufb xmm15, xmm8 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm12 paddd xmm9, xmm13 paddd xmm10, xmm14 paddd xmm11, xmm15 pxor xmm4, xmm8 pxor xmm5, xmm9 pxor xmm6, xmm10 pxor xmm7, xmm11 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 paddd xmm0, xmmword ptr [rsp+0xE0] paddd xmm1, xmmword ptr [rsp+0x20] paddd xmm2, xmmword ptr [rsp+0x30] paddd xmm3, xmmword ptr [rsp+0x70] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT16+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 movdqa xmmword ptr [rsp+0x100], xmm8 movdqa xmm8, xmm5 psrld xmm8, 12 pslld xmm5, 20 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 12 pslld xmm6, 20 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 12 pslld xmm7, 20 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 12 pslld xmm4, 20 por xmm4, xmm8 paddd xmm0, xmmword ptr [rsp+0xA0] paddd xmm1, xmmword ptr [rsp+0xC0] paddd xmm2, xmmword ptr [rsp+0x40] paddd xmm3, xmmword ptr [rsp+0xD0] paddd xmm0, xmm5 paddd xmm1, xmm6 paddd xmm2, xmm7 paddd xmm3, xmm4 pxor xmm15, xmm0 pxor xmm12, xmm1 pxor xmm13, xmm2 pxor xmm14, xmm3 movdqa xmm8, xmmword ptr [ROT8+rip] pshufb xmm15, xmm8 pshufb xmm12, xmm8 pshufb 
xmm13, xmm8 pshufb xmm14, xmm8 paddd xmm10, xmm15 paddd xmm11, xmm12 movdqa xmm8, xmmword ptr [rsp+0x100] paddd xmm8, xmm13 paddd xmm9, xmm14 pxor xmm5, xmm10 pxor xmm6, xmm11 pxor xmm7, xmm8 pxor xmm4, xmm9 pxor xmm0, xmm8 pxor xmm1, xmm9 pxor xmm2, xmm10 pxor xmm3, xmm11 movdqa xmm8, xmm5 psrld xmm8, 7 pslld xmm5, 25 por xmm5, xmm8 movdqa xmm8, xmm6 psrld xmm8, 7 pslld xmm6, 25 por xmm6, xmm8 movdqa xmm8, xmm7 psrld xmm8, 7 pslld xmm7, 25 por xmm7, xmm8 movdqa xmm8, xmm4 psrld xmm8, 7 pslld xmm4, 25 por xmm4, xmm8 pxor xmm4, xmm12 pxor xmm5, xmm13 pxor xmm6, xmm14 pxor xmm7, xmm15 mov eax, r13d jne 9b movdqa xmm9, xmm0 punpckldq xmm0, xmm1 punpckhdq xmm9, xmm1 movdqa xmm11, xmm2 punpckldq xmm2, xmm3 punpckhdq xmm11, xmm3 movdqa xmm1, xmm0 punpcklqdq xmm0, xmm2 punpckhqdq xmm1, xmm2 movdqa xmm3, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm3, xmm11 movdqu xmmword ptr [rbx], xmm0 movdqu xmmword ptr [rbx+0x20], xmm1 movdqu xmmword ptr [rbx+0x40], xmm9 movdqu xmmword ptr [rbx+0x60], xmm3 movdqa xmm9, xmm4 punpckldq xmm4, xmm5 punpckhdq xmm9, xmm5 movdqa xmm11, xmm6 punpckldq xmm6, xmm7 punpckhdq xmm11, xmm7 movdqa xmm5, xmm4 punpcklqdq xmm4, xmm6 punpckhqdq xmm5, xmm6 movdqa xmm7, xmm9 punpcklqdq xmm9, xmm11 punpckhqdq xmm7, xmm11 movdqu xmmword ptr [rbx+0x10], xmm4 movdqu xmmword ptr [rbx+0x30], xmm5 movdqu xmmword ptr [rbx+0x50], xmm9 movdqu xmmword ptr [rbx+0x70], xmm7 movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm0, xmm1 paddd xmm1, xmmword ptr [rsp+0x150] movdqa xmmword ptr [rsp+0x110], xmm1 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] pcmpgtd xmm0, xmm1 movdqa xmm1, xmmword ptr [rsp+0x120] psubd xmm1, xmm0 movdqa xmmword ptr [rsp+0x120], xmm1 add rbx, 128 add rdi, 32 sub rsi, 4 cmp rsi, 4 jnc 2b test rsi, rsi jnz 3f 4: mov rsp, rbp pop rbp pop rbx pop r12 pop r13 pop r14 pop r15 RET .p2align 5 3: test esi, 0x2 je 3f movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movaps xmm8, xmm0 movaps xmm9, xmm1 movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp], xmm13 movd xmm14, dword ptr [rsp+0x114] pinsrd xmm14, dword ptr [rsp+0x124], 1 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmmword ptr [rsp+0x10], xmm14 mov r8, qword ptr [rdi] mov r9, qword ptr [rdi+0x8] movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm10, xmm2 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm3, xmm4 shufps xmm4, xmm5, 136 shufps xmm3, xmm5, 221 movaps xmm5, xmm3 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm3, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm3, xmm7, 221 pshufd xmm7, xmm3, 0x93 movups xmm12, xmmword ptr [r9+rdx-0x40] movups xmm13, xmmword ptr [r9+rdx-0x30] movaps xmm11, xmm12 shufps xmm12, xmm13, 136 shufps xmm11, xmm13, 221 movaps xmm13, xmm11 movups xmm14, xmmword ptr [r9+rdx-0x20] movups xmm15, xmmword ptr [r9+rdx-0x10] movaps xmm11, xmm14 shufps xmm14, xmm15, 136 pshufd xmm14, xmm14, 0x93 shufps xmm11, xmm15, 221 pshufd xmm15, xmm11, 0x93 movaps xmm3, xmmword ptr [rsp] movaps xmm11, xmmword ptr [rsp+0x10] pinsrd xmm3, eax, 3 pinsrd xmm11, eax, 3 mov al, 7 9: paddd xmm0, xmm4 paddd xmm8, xmm12 movaps xmmword ptr [rsp+0x20], xmm4 movaps xmmword ptr [rsp+0x30], xmm12 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor 
xmm11, xmm8 movaps xmm12, xmmword ptr [ROT16+rip] pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm5 paddd xmm8, xmm13 movaps xmmword ptr [rsp+0x40], xmm5 movaps xmmword ptr [rsp+0x50], xmm13 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 movaps xmm13, xmmword ptr [ROT8+rip] pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x93 pshufd xmm8, xmm8, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x39 pshufd xmm10, xmm10, 0x39 paddd xmm0, xmm6 paddd xmm8, xmm14 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm12 pshufb xmm11, xmm12 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 20 psrld xmm4, 12 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 20 psrld xmm4, 12 por xmm9, xmm4 paddd xmm0, xmm7 paddd xmm8, xmm15 paddd xmm0, xmm1 paddd xmm8, xmm9 pxor xmm3, xmm0 pxor xmm11, xmm8 pshufb xmm3, xmm13 pshufb xmm11, xmm13 paddd xmm2, xmm3 paddd xmm10, xmm11 pxor xmm1, xmm2 pxor xmm9, xmm10 movdqa xmm4, xmm1 pslld xmm1, 25 psrld xmm4, 7 por xmm1, xmm4 movdqa xmm4, xmm9 pslld xmm9, 25 psrld xmm4, 7 por xmm9, xmm4 pshufd xmm0, xmm0, 0x39 pshufd xmm8, xmm8, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm11, xmm11, 0x4E pshufd xmm2, xmm2, 0x93 pshufd xmm10, xmm10, 0x93 dec al je 9f movdqa xmm12, xmmword ptr [rsp+0x20] movdqa xmm5, xmmword ptr [rsp+0x40] pshufd xmm13, xmm12, 0x0F shufps xmm12, xmm5, 214 pshufd xmm4, xmm12, 0x39 movdqa xmm12, xmm6 shufps xmm12, xmm7, 250 pblendw xmm13, xmm12, 0xCC movdqa xmm12, xmm7 punpcklqdq xmm12, xmm5 pblendw xmm12, xmm6, 0xC0 pshufd xmm12, xmm12, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmmword ptr [rsp+0x20], xmm13 movdqa xmmword ptr [rsp+0x40], xmm12 movdqa xmm5, xmmword ptr [rsp+0x30] movdqa xmm13, xmmword ptr [rsp+0x50] pshufd xmm6, xmm5, 0x0F shufps xmm5, xmm13, 214 pshufd xmm12, xmm5, 0x39 movdqa xmm5, xmm14 shufps xmm5, xmm15, 250 pblendw xmm6, xmm5, 0xCC movdqa xmm5, xmm15 punpcklqdq xmm5, xmm13 pblendw xmm5, xmm14, 0xC0 pshufd xmm5, xmm5, 0x78 punpckhdq xmm13, xmm15 punpckldq xmm14, xmm13 pshufd xmm15, xmm14, 0x1E movdqa xmm13, xmm6 movdqa xmm14, xmm5 movdqa xmm5, xmmword ptr [rsp+0x20] movdqa xmm6, xmmword ptr [rsp+0x40] jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm8, xmm10 pxor xmm9, xmm11 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 movups xmmword ptr [rbx+0x20], xmm8 movups xmmword ptr [rbx+0x30], xmm9 movdqa xmm0, xmmword ptr [rsp+0x130] movdqa xmm1, xmmword ptr [rsp+0x110] movdqa xmm2, xmmword ptr [rsp+0x120] movdqu xmm3, xmmword ptr [rsp+0x118] movdqu xmm4, xmmword ptr [rsp+0x128] blendvps xmm1, xmm3, xmm0 blendvps xmm2, xmm4, xmm0 movdqa xmmword ptr [rsp+0x110], xmm1 movdqa xmmword ptr [rsp+0x120], xmm2 add rdi, 16 add rbx, 64 sub rsi, 2 3: test esi, 0x1 je 4b movups xmm0, xmmword ptr [rcx] movups xmm1, xmmword ptr [rcx+0x10] movd xmm13, dword ptr [rsp+0x110] pinsrd xmm13, dword ptr [rsp+0x120], 1 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov r8, qword ptr [rdi] 
movzx eax, byte ptr [rbp+0x40] or eax, r13d xor edx, edx 2: mov r14d, eax or eax, r12d add rdx, 64 cmp rdx, r15 cmovne eax, r14d movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movaps xmm3, xmm13 pinsrd xmm3, eax, 3 movups xmm4, xmmword ptr [r8+rdx-0x40] movups xmm5, xmmword ptr [r8+rdx-0x30] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [r8+rdx-0x20] movups xmm7, xmmword ptr [r8+rdx-0x10] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 mov eax, r13d cmp rdx, r15 jne 2b movups xmmword ptr [rbx], xmm0 movups xmmword ptr [rbx+0x10], xmm1 jmp 4b -.p2align 6 -zfs_blake3_compress_in_place_sse41: + +ENTRY_ALIGN(zfs_blake3_compress_in_place_sse41, 64) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] shl r8, 32 add rdx, r8 movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps 
xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: pxor xmm0, xmm2 pxor xmm1, xmm3 movups xmmword ptr [rdi], xmm0 movups xmmword ptr [rdi+0x10], xmm1 RET -.p2align 6 -zfs_blake3_compress_xof_sse41: + +ENTRY_ALIGN(zfs_blake3_compress_xof_sse41, 64) ENDBR movups xmm0, xmmword ptr [rdi] movups xmm1, xmmword ptr [rdi+0x10] movaps xmm2, xmmword ptr [BLAKE3_IV+rip] movzx eax, r8b movzx edx, dl shl rax, 32 add rdx, rax movq xmm3, rcx movq xmm4, rdx punpcklqdq xmm3, xmm4 movups xmm4, xmmword ptr [rsi] movups xmm5, xmmword ptr [rsi+0x10] movaps xmm8, xmm4 shufps xmm4, xmm5, 136 shufps xmm8, xmm5, 221 movaps xmm5, xmm8 movups xmm6, xmmword ptr [rsi+0x20] movups xmm7, xmmword ptr [rsi+0x30] movaps xmm8, xmm6 shufps xmm6, xmm7, 136 pshufd xmm6, xmm6, 0x93 shufps xmm8, xmm7, 221 pshufd xmm7, xmm8, 0x93 movaps xmm14, xmmword ptr [ROT8+rip] movaps xmm15, xmmword ptr [ROT16+rip] mov al, 7 9: paddd xmm0, xmm4 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm5 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x93 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x39 paddd xmm0, xmm6 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm15 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 20 psrld xmm11, 12 por xmm1, xmm11 paddd xmm0, xmm7 paddd xmm0, xmm1 pxor xmm3, xmm0 pshufb xmm3, xmm14 paddd xmm2, xmm3 pxor xmm1, xmm2 movdqa xmm11, xmm1 pslld xmm1, 25 psrld xmm11, 7 por xmm1, xmm11 pshufd xmm0, xmm0, 0x39 pshufd xmm3, xmm3, 0x4E pshufd xmm2, xmm2, 0x93 dec al jz 9f movdqa xmm8, xmm4 shufps xmm8, xmm5, 214 pshufd xmm9, xmm4, 0x0F pshufd xmm4, xmm8, 0x39 movdqa xmm8, xmm6 shufps xmm8, xmm7, 250 pblendw xmm9, xmm8, 0xCC movdqa xmm8, xmm7 punpcklqdq xmm8, xmm5 pblendw xmm8, xmm6, 0xC0 pshufd xmm8, xmm8, 0x78 punpckhdq xmm5, xmm7 punpckldq xmm6, xmm5 pshufd xmm7, xmm6, 0x1E movdqa xmm5, xmm9 movdqa xmm6, xmm8 jmp 9b 9: movdqu xmm4, xmmword ptr [rdi] movdqu xmm5, xmmword ptr [rdi+0x10] pxor xmm0, xmm2 pxor xmm1, xmm3 pxor xmm2, xmm4 pxor xmm3, xmm5 movups xmmword ptr [r9], xmm0 movups xmmword ptr [r9+0x10], xmm1 movups xmmword ptr [r9+0x20], xmm2 movups xmmword ptr [r9+0x30], xmm3 RET -.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41 -.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41 -.size zfs_blake3_compress_xof_sse41, . 
- zfs_blake3_compress_xof_sse41 +SET_SIZE(zfs_blake3_hash_many_sse41) +SET_SIZE(zfs_blake3_compress_in_place_sse41) +SET_SIZE(zfs_blake3_compress_xof_sse41) + +SECTION_STATIC -#ifdef __APPLE__ -.static_data -#else -.section .rodata -#endif .p2align 6 BLAKE3_IV: .long 0x6A09E667, 0xBB67AE85 .long 0x3C6EF372, 0xA54FF53A ROT16: .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 ROT8: .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 -ADD0: +ADD0: .long 0, 1, 2, 3 ADD1: .long 4, 4, 4, 4 BLAKE3_IV_0: .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 BLAKE3_IV_1: .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 BLAKE3_IV_2: .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 BLAKE3_IV_3: .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A BLAKE3_BLOCK_LEN: .long 64, 64, 64, 64 CMP_MSB_MASK: .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 #endif /* HAVE_SSE4_1 */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index cf17b3768712..75dd2c721f56 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -1,1271 +1,1259 @@ # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. 
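Editor's note: the stitched loop described in the header above keeps the AESENC and PCLMULQDQ pipelines busy at the same time: while the counters for the next six blocks run through the AES rounds, the six previously produced ciphertext blocks are folded into the GHASH accumulator. As a rough orientation only, here is a hedged C sketch of that structure. aes_encrypt_block() and gf128_mul() are hypothetical helpers standing in for the AESENC rounds and the Karatsuba/PCLMULQDQ multiply; the real code additionally byte-swaps operands and aggregates several multiplications using precomputed powers of H.

#include <stdint.h>

/* Hypothetical helpers; in the assembly the real work is done by AESENC
 * and PCLMULQDQ. */
extern void aes_encrypt_block(const uint8_t *key_sched, int rounds,
    const uint8_t in[16], uint8_t out[16]);
extern void gf128_mul(uint8_t acc[16], const uint8_t h[16]);	/* acc *= H */

/*
 * One "stitched" pass (sketch only, not the ICP interface): encrypt six
 * counter blocks while folding the six previously produced ciphertext
 * blocks into the GHASH accumulator, then XOR the keystream over the input.
 */
static void
gcm_stitched_chunk(const uint8_t *key_sched, int rounds, uint8_t ctr[16],
    const uint8_t h[16], uint8_t ghash[16], const uint8_t prev_ct[96],
    const uint8_t in[96], uint8_t out[96])
{
	uint8_t ks[6][16];

	for (int i = 0; i < 6; i++) {
		/* AES-CTR half: encrypt the counter, bump its low 32 bits
		 * (big-endian increment with carry). */
		aes_encrypt_block(key_sched, rounds, ctr, ks[i]);
		for (int j = 15; j >= 12; j--)
			if (++ctr[j] != 0)
				break;

		/* GHASH half: X = (X ^ C_prev[i]) * H. */
		for (int j = 0; j < 16; j++)
			ghash[j] ^= prev_ct[i * 16 + j];
		gf128_mul(ghash, h);
	}
	for (int i = 0; i < 96; i++)
		out[i] = in[i] ^ ks[i / 16][i % 16];
}

In the assembly the two halves are interleaved instruction by instruction rather than written as separate loop bodies, which is where the per-byte gains cited in the header come from.
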
#if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) + .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: +.align 32 +FUNCTION(_aesni_ctr32_ghash_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x .align 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq 
%r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. 
jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail .align 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +SET_SIZE(_aesni_ctr32_ghash_6x) #endif /* ifdef HAVE_MOVBE */ -.type _aesni_ctr32_ghash_no_movbe_6x,@function -.align 32 -_aesni_ctr32_ghash_no_movbe_6x: +.align 32 +FUNCTION(_aesni_ctr32_ghash_no_movbe_6x) .cfi_startproc ENDBR vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x_nmb .align 32 .Loop6x_nmb: addl $100663296,%ebx jc .Lhandle_ctr32_nmb vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32_nmb: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor 
%xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movq 88(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 80(%r14),%r12 bswapq %r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movq 72(%r14),%r13 bswapq %r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movq 64(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movq 56(%r14),%r13 bswapq %r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movq 48(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movq 40(%r14),%r13 bswapq %r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movq 32(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movq 24(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 16(%r14),%r12 bswapq %r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc 
%xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movq 8(%r14),%r13 bswapq %r13 vaesenc %xmm1,%xmm13,%xmm13 movq 0(%r14),%r12 bswapq %r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail_nmb .align 32 .Lhandle_ctr32_nmb: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32_nmb .align 32 .Lenc_tail_nmb: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done_nmb vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x_nmb .L6x_done_nmb: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc -.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x +SET_SIZE(_aesni_ctr32_ghash_no_movbe_6x) -.globl aesni_gcm_decrypt -.type aesni_gcm_decrypt,@function -.align 32 -aesni_gcm_decrypt: +ENTRY_ALIGN(aesni_gcm_decrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 
.cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 vmovdqu (%r9),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx movq 32(%r9),%r9 leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax RET .cfi_endproc -.size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: +SET_SIZE(aesni_gcm_decrypt) + +.align 32 +FUNCTION(_aesni_ctr32_6x) .cfi_startproc ENDBR vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi RET .align 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x +SET_SIZE(_aesni_ctr32_6x) -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: +ENTRY_ALIGN(aesni_gcm_encrypt, 32) .cfi_startproc ENDBR xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. 
andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: leaq (%rsi),%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 movq 32(%r9),%r9 leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi) vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor %xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 
vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax RET .cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +SET_SIZE(aesni_gcm_encrypt) + +#endif /* !_WIN32 || _KERNEL */ /* Some utility routines */ /* * clear all fpu registers * void clear_fpu_regs_avx(void); */ -.globl clear_fpu_regs_avx -.type clear_fpu_regs_avx,@function -.align 32 -clear_fpu_regs_avx: +ENTRY_ALIGN(clear_fpu_regs_avx, 32) vzeroall RET -.size clear_fpu_regs_avx,.-clear_fpu_regs_avx +SET_SIZE(clear_fpu_regs_avx) /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); * * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ -.globl gcm_xor_avx -.type gcm_xor_avx,@function -.align 32 -gcm_xor_avx: +ENTRY_ALIGN(gcm_xor_avx, 32) movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) RET -.size gcm_xor_avx,.-gcm_xor_avx +SET_SIZE(gcm_xor_avx) /* * Toggle a boolean_t value atomically and return the new value. 
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ -.globl atomic_toggle_boolean_nv -.type atomic_toggle_boolean_nv,@function -.align 32 -atomic_toggle_boolean_nv: +ENTRY_ALIGN(atomic_toggle_boolean_nv, 32) xorl %eax, %eax lock xorl $1, (%rdi) jz 1f movl $1, %eax 1: RET -.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv +SET_SIZE(atomic_toggle_boolean_nv) + +SECTION_STATIC -.pushsection .rodata .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.popsection /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S index bf3724a23eae..d48b4f2155cc 100644 --- a/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -1,724 +1,720 @@ # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. # # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? 
Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. #if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include .text -.globl gcm_gmult_clmul -.type gcm_gmult_clmul,@function -.align 16 -gcm_gmult_clmul: +/* Windows userland links with OpenSSL */ +#if !defined (_WIN32) || defined (_KERNEL) +ENTRY_ALIGN(gcm_gmult_clmul, 16) + .cfi_startproc ENDBR + .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) RET .cfi_endproc -.size gcm_gmult_clmul,.-gcm_gmult_clmul +SET_SIZE(gcm_gmult_clmul) +#endif /* !_WIN32 || _KERNEL */ -.globl gcm_init_htab_avx -.type gcm_init_htab_avx,@function -.align 32 -gcm_init_htab_avx: +ENTRY_ALIGN(gcm_init_htab_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rsi),%xmm2 // KCF/ICP stores H in network byte order with the hi qword first // so we need to swap all bytes, not the 2 qwords. 
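// Added summary: after the swap, H is shifted left by one bit modulo the
// GCM polynomial (.L0x1c2_polynomial), and the loop below then precomputes
// eight powers of H, together with their xored halves used by the
// Karatsuba multiplications, into the Htable at (%rdi) that gcm_ghash_avx
// consumes eight blocks at a time.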
vmovdqu .Lbswap_mask(%rip),%xmm4 vpshufb %xmm4,%xmm2,%xmm2 vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp .Linit_start_avx .align 32 .Linit_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu %xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz .Linit_loop_avx vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper RET .cfi_endproc -.size gcm_init_htab_avx,.-gcm_init_htab_avx +SET_SIZE(gcm_init_htab_avx) -.globl gcm_gmult_avx -.type gcm_gmult_avx,@function -.align 32 -gcm_gmult_avx: +#if !defined (_WIN32) || defined (_KERNEL) +ENTRY_ALIGN(gcm_gmult_avx, 32) .cfi_startproc ENDBR jmp .L_gmult_clmul .cfi_endproc -.size gcm_gmult_avx,.-gcm_gmult_avx -.globl gcm_ghash_avx -.type gcm_ghash_avx,@function -.align 32 -gcm_ghash_avx: +SET_SIZE(gcm_gmult_avx) + +ENTRY_ALIGN(gcm_ghash_avx, 32) .cfi_startproc ENDBR vzeroupper vmovdqu (%rdi),%xmm10 leaq .L0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu .Lbswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb .Lshort_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor 
%xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb .Ltail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp .Loop8x_avx .align 32 .Loop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor 
%xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor %xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc .Loop8x_avx addq $0x80,%rcx jmp .Ltail_no_xor_avx .align 32 .Lshort_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq 
$0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp .Ltail_avx .align 32 .Ltail_avx: vpxor %xmm10,%xmm15,%xmm15 .Ltail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne .Lshort_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper RET .cfi_endproc -.size gcm_ghash_avx,.-gcm_ghash_avx +SET_SIZE(gcm_ghash_avx) + +#endif /* !_WIN32 || _KERNEL */ -.pushsection .rodata +SECTION_STATIC .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,450,0 .align 64 -.type .Lrem_4bit,@object +SET_OBJ(.Lrem_4bit) .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 -.type .Lrem_8bit,@object +SET_OBJ(.Lrem_8bit) .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 
0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 -.popsection /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index 60d34b4a3be0..321d5da461db 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -1,2090 +1,2090 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. Added perl "use strict" and declared variables. * * 2. 
Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA256TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*4+4*8,%rsp lea (%rsi,%rdx,4),%rdx # inp+num*16*4 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*4+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+88,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K256-.(%rbp),%rbp mov 4*0(%rdi),%eax mov 4*1(%rdi),%ebx mov 4*2(%rdi),%ecx mov 4*3(%rdi),%edx mov 4*4(%rdi),%r8d mov 4*5(%rdi),%r9d mov 4*6(%rdi),%r10d mov 4*7(%rdi),%r11d jmp .Lloop .align 16 .Lloop: xor %rdi,%rdi mov 4*0(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*1(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # 
Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*2(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*3(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*4(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*5(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*6(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add 
(%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*7(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 4*8(%rsi),%r12d bswap %r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 4*9(%rsi),%r12d bswap %r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 4*10(%rsi),%r12d bswap %r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 4*11(%rsi),%r12d bswap %r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) 
xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 4*12(%rsi),%r12d bswap %r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 4*13(%rsi),%r12d bswap %r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 4*14(%rsi),%r12d bswap %r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 4*15(%rsi),%r12d bswap %r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) jmp .Lrounds_16_xx 
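# Rounds 16..63: the 16-word message schedule kept at 0(%rsp)..60(%rsp) is
# expanded in place, W[i&15] += sigma0(W[(i+1)&15]) + sigma1(W[(i+14)&15])
# + W[(i+9)&15], and each expanded word is fed through the same
# Sigma1/Ch/Sigma0/Maj round body used for rounds 0..15 above.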
.align 16 .Lrounds_16_xx: mov 4(%rsp),%r13d mov 56(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 36(%rsp),%r12d add 0(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,0(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 8(%rsp),%r13d mov 60(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 40(%rsp),%r12d add 4(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,4(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 12(%rsp),%r13d mov 0(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 44(%rsp),%r12d add 8(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,8(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 16(%rsp),%r13d mov 4(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 48(%rsp),%r12d add 12(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor 
%r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,12(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 20(%rsp),%r13d mov 8(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 52(%rsp),%r12d add 16(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,16(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 24(%rsp),%r13d mov 12(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 56(%rsp),%r12d add 20(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,20(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov %edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 28(%rsp),%r13d mov 16(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 60(%rsp),%r12d add 24(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,24(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c 
add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 32(%rsp),%r13d mov 20(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 0(%rsp),%r12d add 28(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,28(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) mov 36(%rsp),%r13d mov 24(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 4(%rsp),%r12d add 32(%rsp),%r12d mov %r8d,%r13d mov %r8d,%r14d mov %r9d,%r15d ror $6,%r13d ror $11,%r14d xor %r10d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r8d,%r15d # (f^g)&e mov %r12d,32(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r11d,%r12d # T1+=h mov %eax,%r11d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %eax,%r13d mov %eax,%r14d ror $2,%r11d ror $13,%r13d mov %eax,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r11d ror $9,%r13d or %ecx,%r14d # a|c xor %r13d,%r11d # h=Sigma0(a) and %ecx,%r15d # a&c add %r12d,%edx # d+=T1 and %ebx,%r14d # (a|c)&b add %r12d,%r11d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r11d # h+=Maj(a,b,c) mov 40(%rsp),%r13d mov 28(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 8(%rsp),%r12d add 36(%rsp),%r12d mov %edx,%r13d mov %edx,%r14d mov %r8d,%r15d ror $6,%r13d ror $11,%r14d xor %r9d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %edx,%r15d # (f^g)&e mov %r12d,36(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r10d,%r12d # T1+=h mov %r11d,%r10d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r11d,%r13d mov %r11d,%r14d ror $2,%r10d ror $13,%r13d mov %r11d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r10d ror $9,%r13d or %ebx,%r14d # a|c xor %r13d,%r10d # h=Sigma0(a) and %ebx,%r15d # a&c add %r12d,%ecx # d+=T1 and %eax,%r14d # (a|c)&b add %r12d,%r10d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r10d # h+=Maj(a,b,c) mov 44(%rsp),%r13d mov 32(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add 
%r13d,%r12d add 12(%rsp),%r12d add 40(%rsp),%r12d mov %ecx,%r13d mov %ecx,%r14d mov %edx,%r15d ror $6,%r13d ror $11,%r14d xor %r8d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ecx,%r15d # (f^g)&e mov %r12d,40(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r9d,%r12d # T1+=h mov %r10d,%r9d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r10d,%r13d mov %r10d,%r14d ror $2,%r9d ror $13,%r13d mov %r10d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r9d ror $9,%r13d or %eax,%r14d # a|c xor %r13d,%r9d # h=Sigma0(a) and %eax,%r15d # a&c add %r12d,%ebx # d+=T1 and %r11d,%r14d # (a|c)&b add %r12d,%r9d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r9d # h+=Maj(a,b,c) mov 48(%rsp),%r13d mov 36(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 16(%rsp),%r12d add 44(%rsp),%r12d mov %ebx,%r13d mov %ebx,%r14d mov %ecx,%r15d ror $6,%r13d ror $11,%r14d xor %edx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %ebx,%r15d # (f^g)&e mov %r12d,44(%rsp) xor %r14d,%r13d # Sigma1(e) xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %r8d,%r12d # T1+=h mov %r9d,%r8d add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r9d,%r13d mov %r9d,%r14d ror $2,%r8d ror $13,%r13d mov %r9d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%r8d ror $9,%r13d or %r11d,%r14d # a|c xor %r13d,%r8d # h=Sigma0(a) and %r11d,%r15d # a&c add %r12d,%eax # d+=T1 and %r10d,%r14d # (a|c)&b add %r12d,%r8d # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%r8d # h+=Maj(a,b,c) mov 52(%rsp),%r13d mov 40(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 20(%rsp),%r12d add 48(%rsp),%r12d mov %eax,%r13d mov %eax,%r14d mov %ebx,%r15d ror $6,%r13d ror $11,%r14d xor %ecx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %eax,%r15d # (f^g)&e mov %r12d,48(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %edx,%r12d # T1+=h mov %r8d,%edx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %r8d,%r13d mov %r8d,%r14d ror $2,%edx ror $13,%r13d mov %r8d,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%edx ror $9,%r13d or %r10d,%r14d # a|c xor %r13d,%edx # h=Sigma0(a) and %r10d,%r15d # a&c add %r12d,%r11d # d+=T1 and %r9d,%r14d # (a|c)&b add %r12d,%edx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%edx # h+=Maj(a,b,c) mov 56(%rsp),%r13d mov 44(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 24(%rsp),%r12d add 52(%rsp),%r12d mov %r11d,%r13d mov %r11d,%r14d mov %eax,%r15d ror $6,%r13d ror $11,%r14d xor %ebx,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r11d,%r15d # (f^g)&e mov %r12d,52(%rsp) xor %r14d,%r13d # Sigma1(e) xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ecx,%r12d # T1+=h mov %edx,%ecx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %edx,%r13d mov %edx,%r14d ror $2,%ecx ror $13,%r13d mov 
%edx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ecx ror $9,%r13d or %r9d,%r14d # a|c xor %r13d,%ecx # h=Sigma0(a) and %r9d,%r15d # a&c add %r12d,%r10d # d+=T1 and %r8d,%r14d # (a|c)&b add %r12d,%ecx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ecx # h+=Maj(a,b,c) mov 60(%rsp),%r13d mov 48(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 28(%rsp),%r12d add 56(%rsp),%r12d mov %r10d,%r13d mov %r10d,%r14d mov %r11d,%r15d ror $6,%r13d ror $11,%r14d xor %eax,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r10d,%r15d # (f^g)&e mov %r12d,56(%rsp) xor %r14d,%r13d # Sigma1(e) xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g add %ebx,%r12d # T1+=h mov %ecx,%ebx add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ecx,%r13d mov %ecx,%r14d ror $2,%ebx ror $13,%r13d mov %ecx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%ebx ror $9,%r13d or %r8d,%r14d # a|c xor %r13d,%ebx # h=Sigma0(a) and %r8d,%r15d # a&c add %r12d,%r9d # d+=T1 and %edx,%r14d # (a|c)&b add %r12d,%ebx # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%ebx # h+=Maj(a,b,c) mov 0(%rsp),%r13d mov 52(%rsp),%r12d mov %r13d,%r15d shr $3,%r13d ror $7,%r15d xor %r15d,%r13d ror $11,%r15d xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) mov %r12d,%r14d shr $10,%r12d ror $17,%r14d xor %r14d,%r12d ror $2,%r14d xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) add %r13d,%r12d add 32(%rsp),%r12d add 60(%rsp),%r12d mov %r9d,%r13d mov %r9d,%r14d mov %r10d,%r15d ror $6,%r13d ror $11,%r14d xor %r11d,%r15d # f^g xor %r14d,%r13d ror $14,%r14d and %r9d,%r15d # (f^g)&e mov %r12d,60(%rsp) xor %r14d,%r13d # Sigma1(e) xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g add %eax,%r12d # T1+=h mov %ebx,%eax add %r13d,%r12d # T1+=Sigma1(e) add %r15d,%r12d # T1+=Ch(e,f,g) mov %ebx,%r13d mov %ebx,%r14d ror $2,%eax ror $13,%r13d mov %ebx,%r15d add (%rbp,%rdi,4),%r12d # T1+=K[round] xor %r13d,%eax ror $9,%r13d or %edx,%r14d # a|c xor %r13d,%eax # h=Sigma0(a) and %edx,%r15d # a&c add %r12d,%r8d # d+=T1 and %ecx,%r14d # (a|c)&b add %r12d,%eax # h+=T1 or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14d,%eax # h+=Maj(a,b,c) cmp $64,%rdi jb .Lrounds_16_xx mov 16*4+0*8(%rsp),%rdi lea 16*4(%rsi),%rsi add 4*0(%rdi),%eax add 4*1(%rdi),%ebx add 4*2(%rdi),%ecx add 4*3(%rdi),%edx add 4*4(%rdi),%r8d add 4*5(%rdi),%r9d add 4*6(%rdi),%r10d add 4*7(%rdi),%r11d cmp 16*4+2*8(%rsp),%rsi mov %eax,4*0(%rdi) mov %ebx,4*1(%rdi) mov %ecx,4*2(%rdi) mov %edx,4*3(%rdi) mov %r8d,4*4(%rdi) mov %r9d,4*5(%rdi) mov %r10d,4*6(%rdi) mov %r11d,4*7(%rdi) jb .Lloop mov 16*4+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA256TransformBlocks) .section .rodata .align 64 -.type K256,@object +SET_OBJ(K256) K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 #endif /* !lint && !__lint */ #ifdef __ELF__ .section .note.GNU-stack,"",%progbits #endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S index ed7fb362a1ac..180f8e366060 100644 --- a/module/icp/asm-x86_64/sha2/sha512_impl.S +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -1,2115 +1,2116 @@ /* * ==================================================================== * Written by Andy Polyakov for the OpenSSL * project. Rights for redistribution and usage in source and binary * forms are granted according to the OpenSSL license. * ==================================================================== * * sha256/512_block procedure for x86_64. * * 40% improvement over compiler-generated code on Opteron. On EM64T * sha256 was observed to run >80% faster and sha512 - >40%. No magical * tricks, just straight implementation... I really wonder why gcc * [being armed with inline assembler] fails to generate as fast code. * The only thing which is cool about this module is that it's very * same instruction sequence used for both SHA-256 and SHA-512. In * former case the instructions operate on 32-bit operands, while in * latter - on 64-bit ones. All I had to do is to get one flavor right, * the other one passed the test right away:-) * * sha256_block runs in ~1005 cycles on Opteron, which gives you * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock * frequency in GHz. sha512_block runs in ~1275 cycles, which results * in 128*1000/1275=100MBps per GHz. Is there room for improvement? * Well, if you compare it to IA-64 implementation, which maintains * X[16] in register bank[!], tends to 4 instructions per CPU clock * cycle and runs in 1003 cycles, 1275 is very good result for 3-way * issue Opteron pipeline and X[16] maintained in memory. So that *if* * there is a way to improve it, *then* the only way would be to try to * offload X[16] updates to SSE unit, but that would require "deeper" * loop unroll, which in turn would naturally cause size blow-up, not * to mention increased complexity! And once again, only *if* it's * actually possible to noticeably improve overall ILP, instruction * level parallelism, on a given CPU implementation in this case. * * Special note on Intel EM64T. While Opteron CPU exhibits perfect * performance ratio of 1.5 between 64- and 32-bit flavors [see above], * [currently available] EM64T CPUs apparently are far from it. On the * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit * sha256_block:-( This is presumably because 64-bit shifts/rotates * apparently are not atomic instructions, but implemented in microcode. */ /* * OpenSolaris OS modifications * * Sun elects to use this software under the BSD license. * * This source originates from OpenSSL file sha512-x86_64.pl at * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz * (presumably for future OpenSSL release 0.9.8h), with these changes: * * 1. 
Added perl "use strict" and declared variables. * * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. * * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) * assemblers). Replaced the .picmeup macro with assembler code. * * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", * at the beginning of SHA2_CTX (the next field is 8-byte aligned). */ /* * This file was generated by a perl script (sha512-x86_64.pl) that were * used to generate sha256 and sha512 variants from the same code base. * The comments from the original file have been pasted above. */ #if defined(lint) || defined(__lint) #include #include void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) { (void) ctx, (void) in, (void) num; } #else #define _ASM #include ENTRY_NP(SHA512TransformBlocks) .cfi_startproc ENDBR movq %rsp, %rax .cfi_def_cfa_register %rax push %rbx .cfi_offset %rbx,-16 push %rbp .cfi_offset %rbp,-24 push %r12 .cfi_offset %r12,-32 push %r13 .cfi_offset %r13,-40 push %r14 .cfi_offset %r14,-48 push %r15 .cfi_offset %r15,-56 mov %rsp,%rbp # copy %rsp shl $4,%rdx # num*16 sub $16*8+4*8,%rsp lea (%rsi,%rdx,8),%rdx # inp+num*16*8 and $-64,%rsp # align stack frame add $8,%rdi # Skip OpenSolaris field, "algotype" mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg mov %rbp,16*8+3*8(%rsp) # save copy of %rsp # echo ".cfi_cfa_expression %rsp+152,deref,+56" | # openssl/crypto/perlasm/x86_64-xlate.pl .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x38 #.picmeup %rbp # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts # the address of the "next" instruction into the target register # (%rbp). 
This generates these 2 instructions: lea .Llea(%rip),%rbp #nop # .picmeup generates a nop for mod 8 alignment--not needed here .Llea: lea K512-.(%rbp),%rbp mov 8*0(%rdi),%rax mov 8*1(%rdi),%rbx mov 8*2(%rdi),%rcx mov 8*3(%rdi),%rdx mov 8*4(%rdi),%r8 mov 8*5(%rdi),%r9 mov 8*6(%rdi),%r10 mov 8*7(%rdi),%r11 jmp .Lloop .align 16 .Lloop: xor %rdi,%rdi mov 8*0(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*1(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*2(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*3(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*4(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror 
$28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*5(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*6(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*7(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 8*8(%rsi),%r12 bswap %r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 8*9(%rsi),%r12 bswap %r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 
ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 8*10(%rsi),%r12 bswap %r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 8*11(%rsi),%r12 bswap %r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 8*12(%rsi),%r12 bswap %r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 8*13(%rsi),%r12 bswap %r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 8*14(%rsi),%r12 bswap %r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror 
$34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 8*15(%rsi),%r12 bswap %r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: mov 8(%rsp),%r13 mov 112(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 72(%rsp),%r12 add 0(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,0(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 16(%rsp),%r13 mov 120(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 80(%rsp),%r12 add 8(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,8(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 24(%rsp),%r13 mov 0(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 88(%rsp),%r12 add 16(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,16(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 
add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 32(%rsp),%r13 mov 8(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 96(%rsp),%r12 add 24(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,24(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 40(%rsp),%r13 mov 16(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 104(%rsp),%r12 add 32(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,32(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 48(%rsp),%r13 mov 24(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 112(%rsp),%r12 add 40(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,40(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov %rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 56(%rsp),%r13 mov 32(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # 
sigma1(X[(i+14)&0xf]) add %r13,%r12 add 120(%rsp),%r12 add 48(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,48(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 64(%rsp),%r13 mov 40(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 0(%rsp),%r12 add 56(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,56(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) mov 72(%rsp),%r13 mov 48(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 8(%rsp),%r12 add 64(%rsp),%r12 mov %r8,%r13 mov %r8,%r14 mov %r9,%r15 ror $14,%r13 ror $18,%r14 xor %r10,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r8,%r15 # (f^g)&e mov %r12,64(%rsp) xor %r14,%r13 # Sigma1(e) xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r11,%r12 # T1+=h mov %rax,%r11 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rax,%r13 mov %rax,%r14 ror $28,%r11 ror $34,%r13 mov %rax,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r11 ror $5,%r13 or %rcx,%r14 # a|c xor %r13,%r11 # h=Sigma0(a) and %rcx,%r15 # a&c add %r12,%rdx # d+=T1 and %rbx,%r14 # (a|c)&b add %r12,%r11 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r11 # h+=Maj(a,b,c) mov 80(%rsp),%r13 mov 56(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 16(%rsp),%r12 add 72(%rsp),%r12 mov %rdx,%r13 mov %rdx,%r14 mov %r8,%r15 ror $14,%r13 ror $18,%r14 xor %r9,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rdx,%r15 # (f^g)&e mov %r12,72(%rsp) xor %r14,%r13 # Sigma1(e) xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r10,%r12 # T1+=h mov %r11,%r10 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r11,%r13 mov %r11,%r14 ror $28,%r10 ror $34,%r13 mov %r11,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r10 ror $5,%r13 or %rbx,%r14 # a|c xor %r13,%r10 # h=Sigma0(a) and %rbx,%r15 # a&c add %r12,%rcx # d+=T1 and %rax,%r14 # (a|c)&b add %r12,%r10 
# h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r10 # h+=Maj(a,b,c) mov 88(%rsp),%r13 mov 64(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 24(%rsp),%r12 add 80(%rsp),%r12 mov %rcx,%r13 mov %rcx,%r14 mov %rdx,%r15 ror $14,%r13 ror $18,%r14 xor %r8,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rcx,%r15 # (f^g)&e mov %r12,80(%rsp) xor %r14,%r13 # Sigma1(e) xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r9,%r12 # T1+=h mov %r10,%r9 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r10,%r13 mov %r10,%r14 ror $28,%r9 ror $34,%r13 mov %r10,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r9 ror $5,%r13 or %rax,%r14 # a|c xor %r13,%r9 # h=Sigma0(a) and %rax,%r15 # a&c add %r12,%rbx # d+=T1 and %r11,%r14 # (a|c)&b add %r12,%r9 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r9 # h+=Maj(a,b,c) mov 96(%rsp),%r13 mov 72(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 32(%rsp),%r12 add 88(%rsp),%r12 mov %rbx,%r13 mov %rbx,%r14 mov %rcx,%r15 ror $14,%r13 ror $18,%r14 xor %rdx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rbx,%r15 # (f^g)&e mov %r12,88(%rsp) xor %r14,%r13 # Sigma1(e) xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %r8,%r12 # T1+=h mov %r9,%r8 add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r9,%r13 mov %r9,%r14 ror $28,%r8 ror $34,%r13 mov %r9,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%r8 ror $5,%r13 or %r11,%r14 # a|c xor %r13,%r8 # h=Sigma0(a) and %r11,%r15 # a&c add %r12,%rax # d+=T1 and %r10,%r14 # (a|c)&b add %r12,%r8 # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%r8 # h+=Maj(a,b,c) mov 104(%rsp),%r13 mov 80(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 40(%rsp),%r12 add 96(%rsp),%r12 mov %rax,%r13 mov %rax,%r14 mov %rbx,%r15 ror $14,%r13 ror $18,%r14 xor %rcx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %rax,%r15 # (f^g)&e mov %r12,96(%rsp) xor %r14,%r13 # Sigma1(e) xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rdx,%r12 # T1+=h mov %r8,%rdx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %r8,%r13 mov %r8,%r14 ror $28,%rdx ror $34,%r13 mov %r8,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rdx ror $5,%r13 or %r10,%r14 # a|c xor %r13,%rdx # h=Sigma0(a) and %r10,%r15 # a&c add %r12,%r11 # d+=T1 and %r9,%r14 # (a|c)&b add %r12,%rdx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rdx # h+=Maj(a,b,c) mov 112(%rsp),%r13 mov 88(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 48(%rsp),%r12 add 104(%rsp),%r12 mov %r11,%r13 mov %r11,%r14 mov %rax,%r15 ror $14,%r13 ror $18,%r14 xor %rbx,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r11,%r15 # (f^g)&e mov %r12,104(%rsp) xor %r14,%r13 # Sigma1(e) xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rcx,%r12 # T1+=h mov 
%rdx,%rcx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rdx,%r13 mov %rdx,%r14 ror $28,%rcx ror $34,%r13 mov %rdx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rcx ror $5,%r13 or %r9,%r14 # a|c xor %r13,%rcx # h=Sigma0(a) and %r9,%r15 # a&c add %r12,%r10 # d+=T1 and %r8,%r14 # (a|c)&b add %r12,%rcx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rcx # h+=Maj(a,b,c) mov 120(%rsp),%r13 mov 96(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 56(%rsp),%r12 add 112(%rsp),%r12 mov %r10,%r13 mov %r10,%r14 mov %r11,%r15 ror $14,%r13 ror $18,%r14 xor %rax,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r10,%r15 # (f^g)&e mov %r12,112(%rsp) xor %r14,%r13 # Sigma1(e) xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rbx,%r12 # T1+=h mov %rcx,%rbx add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rcx,%r13 mov %rcx,%r14 ror $28,%rbx ror $34,%r13 mov %rcx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rbx ror $5,%r13 or %r8,%r14 # a|c xor %r13,%rbx # h=Sigma0(a) and %r8,%r15 # a&c add %r12,%r9 # d+=T1 and %rdx,%r14 # (a|c)&b add %r12,%rbx # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rbx # h+=Maj(a,b,c) mov 0(%rsp),%r13 mov 104(%rsp),%r12 mov %r13,%r15 shr $7,%r13 ror $1,%r15 xor %r15,%r13 ror $7,%r15 xor %r15,%r13 # sigma0(X[(i+1)&0xf]) mov %r12,%r14 shr $6,%r12 ror $19,%r14 xor %r14,%r12 ror $42,%r14 xor %r14,%r12 # sigma1(X[(i+14)&0xf]) add %r13,%r12 add 64(%rsp),%r12 add 120(%rsp),%r12 mov %r9,%r13 mov %r9,%r14 mov %r10,%r15 ror $14,%r13 ror $18,%r14 xor %r11,%r15 # f^g xor %r14,%r13 ror $23,%r14 and %r9,%r15 # (f^g)&e mov %r12,120(%rsp) xor %r14,%r13 # Sigma1(e) xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g add %rax,%r12 # T1+=h mov %rbx,%rax add %r13,%r12 # T1+=Sigma1(e) add %r15,%r12 # T1+=Ch(e,f,g) mov %rbx,%r13 mov %rbx,%r14 ror $28,%rax ror $34,%r13 mov %rbx,%r15 add (%rbp,%rdi,8),%r12 # T1+=K[round] xor %r13,%rax ror $5,%r13 or %rdx,%r14 # a|c xor %r13,%rax # h=Sigma0(a) and %rdx,%r15 # a&c add %r12,%r8 # d+=T1 and %rcx,%r14 # (a|c)&b add %r12,%rax # h+=T1 or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1(%rdi),%rdi # round++ add %r14,%rax # h+=Maj(a,b,c) cmp $80,%rdi jb .Lrounds_16_xx mov 16*8+0*8(%rsp),%rdi lea 16*8(%rsi),%rsi add 8*0(%rdi),%rax add 8*1(%rdi),%rbx add 8*2(%rdi),%rcx add 8*3(%rdi),%rdx add 8*4(%rdi),%r8 add 8*5(%rdi),%r9 add 8*6(%rdi),%r10 add 8*7(%rdi),%r11 cmp 16*8+2*8(%rsp),%rsi mov %rax,8*0(%rdi) mov %rbx,8*1(%rdi) mov %rcx,8*2(%rdi) mov %rdx,8*3(%rdi) mov %r8,8*4(%rdi) mov %r9,8*5(%rdi) mov %r10,8*6(%rdi) mov %r11,8*7(%rdi) jb .Lloop mov 16*8+3*8(%rsp),%rsp .cfi_def_cfa %rsp,56 pop %r15 .cfi_adjust_cfa_offset -8 .cfi_restore %r15 pop %r14 .cfi_adjust_cfa_offset -8 .cfi_restore %r14 pop %r13 .cfi_adjust_cfa_offset -8 .cfi_restore %r13 pop %r12 .cfi_adjust_cfa_offset -8 .cfi_restore %r12 pop %rbp .cfi_adjust_cfa_offset -8 .cfi_restore %rbp pop %rbx .cfi_adjust_cfa_offset -8 .cfi_restore %rbx RET .cfi_endproc SET_SIZE(SHA512TransformBlocks) .section .rodata .align 64 -.type K512,@object +SET_OBJ(K512) K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 
0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 #endif /* !lint && !__lint */ -#ifdef __ELF__ +#if defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif + diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h index fe5c23974682..66eb4a6c8fb6 100644 --- a/module/icp/include/aes/aes_impl.h +++ b/module/icp/include/aes/aes_impl.h @@ -1,220 +1,221 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _AES_IMPL_H #define _AES_IMPL_H /* * Common definitions used by AES. 
*/ #ifdef __cplusplus extern "C" { #endif #include #include +#include /* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */ #define IS_P2ALIGNED2(v, w, a) \ ((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0) #define AES_BLOCK_LEN 16 /* bytes */ /* Round constant length, in number of 32-bit elements: */ #define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2)) #define AES_COPY_BLOCK(src, dst) \ (dst)[0] = (src)[0]; \ (dst)[1] = (src)[1]; \ (dst)[2] = (src)[2]; \ (dst)[3] = (src)[3]; \ (dst)[4] = (src)[4]; \ (dst)[5] = (src)[5]; \ (dst)[6] = (src)[6]; \ (dst)[7] = (src)[7]; \ (dst)[8] = (src)[8]; \ (dst)[9] = (src)[9]; \ (dst)[10] = (src)[10]; \ (dst)[11] = (src)[11]; \ (dst)[12] = (src)[12]; \ (dst)[13] = (src)[13]; \ (dst)[14] = (src)[14]; \ (dst)[15] = (src)[15] #define AES_XOR_BLOCK(src, dst) \ (dst)[0] ^= (src)[0]; \ (dst)[1] ^= (src)[1]; \ (dst)[2] ^= (src)[2]; \ (dst)[3] ^= (src)[3]; \ (dst)[4] ^= (src)[4]; \ (dst)[5] ^= (src)[5]; \ (dst)[6] ^= (src)[6]; \ (dst)[7] ^= (src)[7]; \ (dst)[8] ^= (src)[8]; \ (dst)[9] ^= (src)[9]; \ (dst)[10] ^= (src)[10]; \ (dst)[11] ^= (src)[11]; \ (dst)[12] ^= (src)[12]; \ (dst)[13] ^= (src)[13]; \ (dst)[14] ^= (src)[14]; \ (dst)[15] ^= (src)[15] /* AES key size definitions */ #define AES_MINBITS 128 #define AES_MAXBITS 256 /* AES key schedule may be implemented with 32- or 64-bit elements: */ #define AES_32BIT_KS 32 #define AES_64BIT_KS 64 #define MAX_AES_NR 14 /* Maximum number of rounds */ #define MAX_AES_NB 4 /* Number of columns comprising a state */ typedef union { #ifdef sun4u uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; #endif uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; } aes_ks_t; typedef struct aes_impl_ops aes_impl_ops_t; /* * The absolute offset of the encr_ks (0) and the nr (504) fields are hard * coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly). */ typedef struct aes_key aes_key_t; struct aes_key { aes_ks_t encr_ks; /* encryption key schedule */ aes_ks_t decr_ks; /* decryption key schedule */ #ifdef __amd64 long double align128; /* Align fields above for Intel AES-NI */ #endif /* __amd64 */ const aes_impl_ops_t *ops; /* ops associated with this schedule */ int nr; /* number of rounds (10, 12, or 14) */ int type; /* key schedule size (32 or 64 bits) */ }; /* * Core AES functions. * ks and keysched are pointers to aes_key_t. * They are declared void* as they are intended to be opaque types. * Use function aes_alloc_keysched() to allocate memory for ks and keysched. */ extern void *aes_alloc_keysched(size_t *size, int kmflag); extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched); extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct); extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt); /* * AES mode functions. * The first 2 functions operate on 16-byte AES blocks. 
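A quick illustration of the IS_P2ALIGNED2() check defined above: ORing the two pointers before masking lets a single test confirm that both buffers meet the alignment, which is how a caller can choose between a word-wide and a byte-wise path. Sketch only; can_use_wide_copy() is a made-up name for this review, not part of the patch.

#include <stdint.h>

#define IS_P2ALIGNED2(v, w, a) \
	((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0)

/* Nonzero only when BOTH pointers are aligned to sizeof (uint32_t). */
static int
can_use_wide_copy(const void *src, const void *dst)
{
	return (IS_P2ALIGNED2(src, dst, sizeof (uint32_t)));
}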
*/ extern void aes_copy_block(uint8_t *in, uint8_t *out); extern void aes_xor_block(uint8_t *data, uint8_t *dst); /* Note: ctx is a pointer to aes_ctx_t defined in modes.h */ extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length, crypto_data_t *out); extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, crypto_data_t *out); /* * The following definitions and declarations are only used by AES FIPS POST */ #ifdef _AES_IMPL typedef enum aes_mech_type { AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */ AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */ AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */ AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */ AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */ AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */ AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */ } aes_mech_type_t; #endif /* _AES_IMPL */ /* * Methods used to define AES implementation * * @aes_gen_f Key generation * @aes_enc_f Function encrypts one block * @aes_dec_f Function decrypts one block * @aes_will_work_f Function tests whether method will function */ typedef void (*aes_generate_f)(aes_key_t *, const uint32_t *, int); typedef void (*aes_encrypt_f)(const uint32_t[], int, const uint32_t[4], uint32_t[4]); typedef void (*aes_decrypt_f)(const uint32_t[], int, const uint32_t[4], uint32_t[4]); typedef boolean_t (*aes_will_work_f)(void); #define AES_IMPL_NAME_MAX (16) struct aes_impl_ops { aes_generate_f generate; aes_encrypt_f encrypt; aes_decrypt_f decrypt; aes_will_work_f is_supported; boolean_t needs_byteswap; char name[AES_IMPL_NAME_MAX]; }; extern const aes_impl_ops_t aes_generic_impl; #if defined(__x86_64) extern const aes_impl_ops_t aes_x86_64_impl; /* These functions are used to execute amd64 instructions for AMD or Intel: */ -extern int rijndael_key_setup_enc_amd64(uint32_t rk[], +extern ASMABI int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); -extern int rijndael_key_setup_dec_amd64(uint32_t rk[], +extern ASMABI int rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], int keyBits); -extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, +extern ASMABI void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], uint32_t ct[4]); -extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, +extern ASMABI void aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], uint32_t pt[4]); #endif #if defined(__x86_64) && defined(HAVE_AES) extern const aes_impl_ops_t aes_aesni_impl; #endif /* * Initializes fastest implementation */ void aes_impl_init(void); /* * Returns optimal allowed AES implementation */ const struct aes_impl_ops *aes_impl_get_ops(void); #ifdef __cplusplus } #endif #endif /* _AES_IMPL_H */ diff --git a/module/icp/include/sys/ia32/stack.h b/module/icp/include/sys/ia32/stack.h deleted file mode 100644 index 9ed327b343a6..000000000000 --- a/module/icp/include/sys/ia32/stack.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _IA32_SYS_STACK_H -#define _IA32_SYS_STACK_H - -#if !defined(_ASM) - -#include - -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * In the x86 world, a stack frame looks like this: - * - * |--------------------------| - * 4n+8(%ebp) ->| argument word n | - * | ... | (Previous frame) - * 8(%ebp) ->| argument word 0 | - * |--------------------------|-------------------- - * 4(%ebp) ->| return address | - * |--------------------------| - * 0(%ebp) ->| previous %ebp (optional) | - * |--------------------------| - * -4(%ebp) ->| unspecified | (Current frame) - * | ... | - * 0(%esp) ->| variable size | - * |--------------------------| - */ - -/* - * Stack alignment macros. - */ - -#define STACK_ALIGN32 4 -#define STACK_ENTRY_ALIGN32 4 -#define STACK_BIAS32 0 -#define SA32(x) (((x)+(STACK_ALIGN32-1)) & ~(STACK_ALIGN32-1)) -#define STACK_RESERVE32 0 -#define MINFRAME32 0 - -#if defined(__amd64) - -/* - * In the amd64 world, a stack frame looks like this: - * - * |--------------------------| - * 8n+16(%rbp)->| argument word n | - * | ... | (Previous frame) - * 16(%rbp) ->| argument word 0 | - * |--------------------------|-------------------- - * 8(%rbp) ->| return address | - * |--------------------------| - * 0(%rbp) ->| previous %rbp | - * |--------------------------| - * -8(%rbp) ->| unspecified | (Current frame) - * | ... | - * 0(%rsp) ->| variable size | - * |--------------------------| - * -128(%rsp) ->| reserved for function | - * |--------------------------| - * - * The end of the input argument area must be aligned on a 16-byte - * boundary; i.e. (%rsp - 8) % 16 == 0 at function entry. - * - * The 128-byte location beyond %rsp is considered to be reserved for - * functions and is NOT modified by signal handlers. It can be used - * to store temporary data that is not needed across function calls. - */ - -/* - * Stack alignment macros. 
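The alignment macros that this (now removed) header defines just below are simple power-of-two round-ups. As a worked example of the arithmetic only, a sketch for this review rather than patch code, SA64() rounds a frame size up to the 16-byte STACK_ALIGN64 boundary described by the layout comment above:

#include <assert.h>

#define STACK_ALIGN64	16
#define SA64(x)		(((x) + (STACK_ALIGN64 - 1)) & ~(STACK_ALIGN64 - 1))

int
main(void)
{
	assert(SA64(0) == 0);		/* multiples of 16 are unchanged */
	assert(SA64(1) == 16);		/* 1..16 all round up to 16 */
	assert(SA64(24) == 32);		/* otherwise, next multiple of 16 */
	return (0);
}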
- */ - -#define STACK_ALIGN64 16 -#define STACK_ENTRY_ALIGN64 8 -#define STACK_BIAS64 0 -#define SA64(x) (((x)+(STACK_ALIGN64-1)) & ~(STACK_ALIGN64-1)) -#define STACK_RESERVE64 128 -#define MINFRAME64 0 - -#define STACK_ALIGN STACK_ALIGN64 -#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN64 -#define STACK_BIAS STACK_BIAS64 -#define SA(x) SA64(x) -#define STACK_RESERVE STACK_RESERVE64 -#define MINFRAME MINFRAME64 - -#elif defined(__i386) - -#define STACK_ALIGN STACK_ALIGN32 -#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN32 -#define STACK_BIAS STACK_BIAS32 -#define SA(x) SA32(x) -#define STACK_RESERVE STACK_RESERVE32 -#define MINFRAME MINFRAME32 - -#endif /* __i386 */ - -#if defined(_KERNEL) && !defined(_ASM) - -#if defined(ZFS_DEBUG) -#if STACK_ALIGN == 4 -#define ASSERT_STACK_ALIGNED() \ - { \ - uint32_t __tmp; \ - ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \ - } -#elif (STACK_ALIGN == 16) && (_LONG_DOUBLE_ALIGNMENT == 16) -#define ASSERT_STACK_ALIGNED() \ - { \ - long double __tmp; \ - ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \ - } -#endif -#else /* DEBUG */ -#define ASSERT_STACK_ALIGNED() -#endif /* DEBUG */ - -struct regs; - -void traceregs(struct regs *); -void traceback(caddr_t); - -#endif /* defined(_KERNEL) && !defined(_ASM) */ - -#define STACK_GROWTH_DOWN /* stacks grow from high to low addresses */ - -#ifdef __cplusplus -} -#endif - -#endif /* _IA32_SYS_STACK_H */ diff --git a/module/icp/include/sys/ia32/trap.h b/module/icp/include/sys/ia32/trap.h deleted file mode 100644 index 3d74266326b2..000000000000 --- a/module/icp/include/sys/ia32/trap.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ -/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ -/* All Rights Reserved */ - -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _IA32_SYS_TRAP_H -#define _IA32_SYS_TRAP_H - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Trap type values - */ - -#define T_ZERODIV 0x0 /* #de divide by 0 error */ -#define T_SGLSTP 0x1 /* #db single step */ -#define T_NMIFLT 0x2 /* NMI */ -#define T_BPTFLT 0x3 /* #bp breakpoint fault, INT3 insn */ -#define T_OVFLW 0x4 /* #of INTO overflow fault */ -#define T_BOUNDFLT 0x5 /* #br BOUND insn fault */ -#define T_ILLINST 0x6 /* #ud invalid opcode fault */ -#define T_NOEXTFLT 0x7 /* #nm device not available: x87 */ -#define T_DBLFLT 0x8 /* #df double fault */ -#define T_EXTOVRFLT 0x9 /* [not generated: 386 only] */ -#define T_TSSFLT 0xa /* #ts invalid TSS fault */ -#define T_SEGFLT 0xb /* #np segment not present fault */ -#define T_STKFLT 0xc /* #ss stack fault */ -#define T_GPFLT 0xd /* #gp general protection fault */ -#define T_PGFLT 0xe /* #pf page fault */ -#define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */ -#define T_ALIGNMENT 0x11 /* #ac alignment check error */ -#define T_MCE 0x12 /* #mc machine check exception */ -#define T_SIMDFPE 0x13 /* #xm SSE/SSE exception */ -#define T_DBGENTR 0x14 /* debugger entry */ -#define T_ENDPERR 0x21 /* emulated extension error flt */ -#define T_ENOEXTFLT 0x20 /* emulated ext not present */ -#define T_FASTTRAP 0xd2 /* fast system call */ -#define T_SYSCALLINT 0x91 /* general system call */ -#define T_DTRACE_RET 0x7f /* DTrace pid return */ -#define T_INT80 0x80 /* int80 handler for linux emulation */ -#define T_SOFTINT 0x50fd /* pseudo softint trap type */ - -/* - * Pseudo traps. - */ -#define T_INTERRUPT 0x100 -#define T_FAULT 0x200 -#define T_AST 0x400 -#define T_SYSCALL 0x180 - - -/* - * Values of error code on stack in case of page fault - */ - -#define PF_ERR_MASK 0x01 /* Mask for error bit */ -#define PF_ERR_PAGE 0x00 /* page not present */ -#define PF_ERR_PROT 0x01 /* protection error */ -#define PF_ERR_WRITE 0x02 /* fault caused by write (else read) */ -#define PF_ERR_USER 0x04 /* processor was in user mode */ - /* (else supervisor) */ -#define PF_ERR_EXEC 0x10 /* attempt to execute a No eXec page (AMD) */ - -/* - * Definitions for fast system call subfunctions - */ -#define T_FNULL 0 /* Null trap for testing */ -#define T_FGETFP 1 /* Get emulated FP context */ -#define T_FSETFP 2 /* Set emulated FP context */ -#define T_GETHRTIME 3 /* Get high resolution time */ -#define T_GETHRVTIME 4 /* Get high resolution virtual time */ -#define T_GETHRESTIME 5 /* Get high resolution time */ -#define T_GETLGRP 6 /* Get home lgrpid */ - -#define T_LASTFAST 6 /* Last valid subfunction */ - -#ifdef __cplusplus -} -#endif - -#endif /* _IA32_SYS_TRAP_H */ diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 291bca044e7b..bf525588e260 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -1,760 +1,760 @@ /* ** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $ ** Stack and Call structure of Lua ** See Copyright Notice in lua.h */ #define ldo_c #define LUA_CORE #include +#include #include "lapi.h" #include "ldebug.h" #include "ldo.h" #include "lfunc.h" #include "lgc.h" #include "lmem.h" #include "lobject.h" #include "lopcodes.h" #include "lparser.h" #include "lstate.h" #include "lstring.h" #include "ltable.h" #include "ltm.h" #include "lvm.h" #include "lzio.h" - /* Return the number of bytes available on the stack. 
*/ #if defined (_KERNEL) && defined(__linux__) #include static intptr_t stack_remaining(void) { intptr_t local; local = (intptr_t)&local - (intptr_t)current->stack; return local; } #elif defined (_KERNEL) && defined(__FreeBSD__) #include static intptr_t stack_remaining(void) { intptr_t local; local = (intptr_t)&local - (intptr_t)curthread->td_kstack; return local; } #else static intptr_t stack_remaining(void) { return INTPTR_MAX; } #endif /* ** {====================================================== ** Error-recovery functions ** ======================================================= */ /* ** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By ** default, Lua handles errors with exceptions when compiling as ** C++ code, with _longjmp/_setjmp when asked to use them, and with ** longjmp/setjmp otherwise. */ #if !defined(LUAI_THROW) #ifdef _KERNEL #ifdef __linux__ #if defined(__i386__) #define JMP_BUF_CNT 6 #elif defined(__x86_64__) #define JMP_BUF_CNT 8 #elif defined(__sparc__) && defined(__arch64__) #define JMP_BUF_CNT 6 #elif defined(__powerpc__) #define JMP_BUF_CNT 26 #elif defined(__aarch64__) #define JMP_BUF_CNT 64 #elif defined(__arm__) #define JMP_BUF_CNT 65 #elif defined(__mips__) #define JMP_BUF_CNT 12 #elif defined(__s390x__) #define JMP_BUF_CNT 18 #elif defined(__riscv) #define JMP_BUF_CNT 64 #else #define JMP_BUF_CNT 1 #endif typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t; -int setjmp(label_t *) __attribute__ ((__nothrow__)); -extern __attribute__((noreturn)) void longjmp(label_t *); +int ASMABI setjmp(label_t *) __attribute__ ((__nothrow__)); +extern __attribute__((noreturn)) void ASMABI longjmp(label_t *); #define LUAI_THROW(L,c) longjmp(&(c)->b) #define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } #define luai_jmpbuf label_t /* unsupported arches will build but not be able to run lua programs */ #if JMP_BUF_CNT == 1 int setjmp (label_t *buf) { return 1; } void longjmp (label_t * buf) { for (;;); } #endif #else #define LUAI_THROW(L,c) longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #endif #else /* _KERNEL */ #if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) /* C++ exceptions */ #define LUAI_THROW(L,c) throw(c) #define LUAI_TRY(L,c,a) \ try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; } #define luai_jmpbuf int /* dummy variable */ #elif defined(LUA_USE_ULONGJMP) /* in Unix, try _longjmp/_setjmp (more efficient) */ #define LUAI_THROW(L,c) _longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #else /* default handling with long jumps */ #define LUAI_THROW(L,c) longjmp((c)->b, 1) #define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } #define luai_jmpbuf jmp_buf #endif #endif /* _KERNEL */ #endif /* LUAI_THROW */ /* chain list of long jump buffers */ struct lua_longjmp { struct lua_longjmp *previous; luai_jmpbuf b; volatile int status; /* error code */ }; static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { switch (errcode) { case LUA_ERRMEM: { /* memory error? */ setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. 
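As a reference for the LUAI_TRY/LUAI_THROW machinery above: in the default (non-C++) configuration an error is raised by longjmp()ing back to the setjmp() that opened the protected region, and the caller learns what happened from the status recorded alongside the jump buffer. Below is a minimal userland sketch of that pattern, added for this review; it is not Lua code, it omits the struct lua_longjmp chaining, and it passes the status through longjmp()'s return value where luaD_throw() instead records it in the lua_longjmp structure.

#include <setjmp.h>
#include <stdio.h>

static jmp_buf b;

static void
throw_error(int errcode)
{
	longjmp(b, errcode);		/* LUAI_THROW */
}

int
main(void)
{
	int status = setjmp(b);		/* LUAI_TRY: 0 on the first pass */
	if (status == 0) {
		throw_error(7);		/* simulate an error being raised */
		puts("not reached");
	}
	printf("recovered, status %d\n", status);
	return (0);
}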
*/ break; } case LUA_ERRERR: { setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); break; } default: { setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ break; } } L->top = oldtop + 1; } /* * Silence infinite recursion warning which was added to -Wall in gcc 12.1 */ #if defined(__GNUC__) && !defined(__clang__) && \ defined(HAVE_KERNEL_INFINITE_RECURSION) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Winfinite-recursion" #endif l_noret luaD_throw (lua_State *L, int errcode) { if (L->errorJmp) { /* thread has an error handler? */ L->errorJmp->status = errcode; /* set status */ LUAI_THROW(L, L->errorJmp); /* jump to it */ } else { /* thread has no error handler */ L->status = cast_byte(errcode); /* mark it as dead */ if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ } else { /* no handler at all; abort */ if (G(L)->panic) { /* panic function? */ lua_unlock(L); G(L)->panic(L); /* call it (last chance to jump out) */ } panic("no error handler"); } } } #if defined(__GNUC__) && !defined(__clang__) && \ defined(HAVE_INFINITE_RECURSION) #pragma GCC diagnostic pop #endif int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { unsigned short oldnCcalls = L->nCcalls; struct lua_longjmp lj; lj.status = LUA_OK; lj.previous = L->errorJmp; /* chain new error handler */ L->errorJmp = &lj; LUAI_TRY(L, &lj, (*f)(L, ud); ); L->errorJmp = lj.previous; /* restore old error handler */ L->nCcalls = oldnCcalls; return lj.status; } /* }====================================================== */ static void correctstack (lua_State *L, TValue *oldstack) { CallInfo *ci; GCObject *up; L->top = (L->top - oldstack) + L->stack; for (up = L->openupval; up != NULL; up = up->gch.next) gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; for (ci = L->ci; ci != NULL; ci = ci->previous) { ci->top = (ci->top - oldstack) + L->stack; ci->func = (ci->func - oldstack) + L->stack; if (isLua(ci)) ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; } } /* some space for error handling */ #define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) void luaD_reallocstack (lua_State *L, int newsize) { TValue *oldstack = L->stack; int lim = L->stacksize; lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); for (; lim < newsize; lim++) setnilvalue(L->stack + lim); /* erase new segment */ L->stacksize = newsize; L->stack_last = L->stack + newsize - EXTRA_STACK; correctstack(L, oldstack); } void luaD_growstack (lua_State *L, int n) { int size = L->stacksize; if (size > LUAI_MAXSTACK) /* error after extra size? */ luaD_throw(L, LUA_ERRERR); else { int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; int newsize = 2 * size; if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; if (newsize < needed) newsize = needed; if (newsize > LUAI_MAXSTACK) { /* stack overflow? 
*/ luaD_reallocstack(L, ERRORSTACKSIZE); luaG_runerror(L, "stack overflow"); } else luaD_reallocstack(L, newsize); } } static int stackinuse (lua_State *L) { CallInfo *ci; StkId lim = L->top; for (ci = L->ci; ci != NULL; ci = ci->previous) { lua_assert(ci->top <= L->stack_last); if (lim < ci->top) lim = ci->top; } return cast_int(lim - L->stack) + 1; /* part of stack in use */ } void luaD_shrinkstack (lua_State *L) { int inuse = stackinuse(L); int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ goodsize >= L->stacksize) /* would grow instead of shrink? */ condmovestack(L); /* don't change stack (change only for debugging) */ else luaD_reallocstack(L, goodsize); /* shrink it */ } void luaD_hook (lua_State *L, int event, int line) { lua_Hook hook = L->hook; if (hook && L->allowhook) { CallInfo *ci = L->ci; ptrdiff_t top = savestack(L, L->top); ptrdiff_t ci_top = savestack(L, ci->top); lua_Debug ar; ar.event = event; ar.currentline = line; ar.i_ci = ci; luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ ci->top = L->top + LUA_MINSTACK; lua_assert(ci->top <= L->stack_last); L->allowhook = 0; /* cannot call hooks inside a hook */ ci->callstatus |= CIST_HOOKED; lua_unlock(L); (*hook)(L, &ar); lua_lock(L); lua_assert(!L->allowhook); L->allowhook = 1; ci->top = restorestack(L, ci_top); L->top = restorestack(L, top); ci->callstatus &= ~CIST_HOOKED; } } static void callhook (lua_State *L, CallInfo *ci) { int hook = LUA_HOOKCALL; ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */ if (isLua(ci->previous) && GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) { ci->callstatus |= CIST_TAIL; hook = LUA_HOOKTAILCALL; } luaD_hook(L, hook, -1); ci->u.l.savedpc--; /* correct 'pc' */ } static StkId adjust_varargs (lua_State *L, Proto *p, int actual) { int i; int nfixargs = p->numparams; StkId base, fixed; lua_assert(actual >= nfixargs); /* move fixed parameters to final position */ luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */ fixed = L->top - actual; /* first fixed argument */ base = L->top; /* final position of first argument */ for (i=0; itop++, fixed + i); setnilvalue(fixed + i); } return base; } static StkId tryfuncTM (lua_State *L, StkId func) { const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL); StkId p; ptrdiff_t funcr = savestack(L, func); if (!ttisfunction(tm)) luaG_typeerror(L, func, "call"); /* Open a hole inside the stack at `func' */ for (p = L->top; p > func; p--) setobjs2s(L, p, p-1); incr_top(L); func = restorestack(L, funcr); /* previous call may change stack */ setobj2s(L, func, tm); /* tag method is the new function to be called */ return func; } #define next_ci(L) (L->ci = (L->ci->next ? 
/*
** returns true if function has been executed (C function)
*/
int luaD_precall (lua_State *L, StkId func, int nresults) {
  lua_CFunction f;
  CallInfo *ci;
  int n;  /* number of arguments (Lua) or returns (C) */
  ptrdiff_t funcr = savestack(L, func);
  switch (ttype(func)) {
    case LUA_TLCF:  /* light C function */
      f = fvalue(func);
      goto Cfunc;
    case LUA_TCCL: {  /* C closure */
      f = clCvalue(func)->f;
     Cfunc:
      luaD_checkstack(L, LUA_MINSTACK);  /* ensure minimum stack size */
      ci = next_ci(L);  /* now 'enter' new function */
      ci->nresults = nresults;
      ci->func = restorestack(L, funcr);
      ci->top = L->top + LUA_MINSTACK;
      lua_assert(ci->top <= L->stack_last);
      ci->callstatus = 0;
      luaC_checkGC(L);  /* stack grow uses memory */
      if (L->hookmask & LUA_MASKCALL)
        luaD_hook(L, LUA_HOOKCALL, -1);
      lua_unlock(L);
      n = (*f)(L);  /* do the actual call */
      lua_lock(L);
      api_checknelems(L, n);
      luaD_poscall(L, L->top - n);
      return 1;
    }
    case LUA_TLCL: {  /* Lua function: prepare its call */
      StkId base;
      Proto *p = clLvalue(func)->p;
      n = cast_int(L->top - func) - 1;  /* number of real arguments */
      luaD_checkstack(L, p->maxstacksize + p->numparams);
      for (; n < p->numparams; n++)
        setnilvalue(L->top++);  /* complete missing arguments */
      if (!p->is_vararg) {
        func = restorestack(L, funcr);
        base = func + 1;
      }
      else {
        base = adjust_varargs(L, p, n);
        func = restorestack(L, funcr);  /* previous call can change stack */
      }
      ci = next_ci(L);  /* now 'enter' new function */
      ci->nresults = nresults;
      ci->func = func;
      ci->u.l.base = base;
      ci->top = base + p->maxstacksize;
      lua_assert(ci->top <= L->stack_last);
      ci->u.l.savedpc = p->code;  /* starting point */
      ci->callstatus = CIST_LUA;
      L->top = ci->top;
      luaC_checkGC(L);  /* stack grow uses memory */
      if (L->hookmask & LUA_MASKCALL)
        callhook(L, ci);
      return 0;
    }
    default: {  /* not a function */
      func = tryfuncTM(L, func);  /* retry with 'function' tag method */
      return luaD_precall(L, func, nresults);  /* now it must be a function */
    }
  }
}

int luaD_poscall (lua_State *L, StkId firstResult) {
  StkId res;
  int wanted, i;
  CallInfo *ci = L->ci;
  if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
    if (L->hookmask & LUA_MASKRET) {
      ptrdiff_t fr = savestack(L, firstResult);  /* hook may change stack */
      luaD_hook(L, LUA_HOOKRET, -1);
      firstResult = restorestack(L, fr);
    }
    L->oldpc = ci->previous->u.l.savedpc;  /* 'oldpc' for caller function */
  }
  res = ci->func;  /* res == final position of 1st result */
  wanted = ci->nresults;
  L->ci = ci->previous;  /* back to caller */
  /* move results to correct place */
  for (i = wanted; i != 0 && firstResult < L->top; i--)
    setobjs2s(L, res++, firstResult++);
  while (i-- > 0)
    setnilvalue(res++);
  L->top = res;
  return (wanted - LUA_MULTRET);  /* 0 iff wanted == LUA_MULTRET */
}

/*
** Call a function (C or Lua). The function to be called is at *func.
** The arguments are on the stack, right after the function.
** When returns, all the results are on the stack, starting at the original
** function position.
*/
void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
  if (++L->nCcalls >= LUAI_MAXCCALLS) {
    if (L->nCcalls == LUAI_MAXCCALLS)
      luaG_runerror(L, "C stack overflow");
    else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
      luaD_throw(L, LUA_ERRERR);  /* error while handling stack error */
  }
  intptr_t remaining = stack_remaining();
  if (L->runerror == 0 && remaining < LUAI_MINCSTACK)
    luaG_runerror(L, "C stack overflow");
  if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2)
    luaD_throw(L, LUA_ERRERR);  /* error while handling stack error */
  if (!allowyield) L->nny++;
  if (!luaD_precall(L, func, nResults))  /* is a Lua function? */
    luaV_execute(L);  /* call it */
  if (!allowyield) L->nny--;
  L->nCcalls--;
}

static void finishCcall (lua_State *L) {
  CallInfo *ci = L->ci;
  int n;
  lua_assert(ci->u.c.k != NULL);  /* must have a continuation */
  lua_assert(L->nny == 0);
  if (ci->callstatus & CIST_YPCALL) {  /* was inside a pcall? */
    ci->callstatus &= ~CIST_YPCALL;  /* finish 'lua_pcall' */
    L->errfunc = ci->u.c.old_errfunc;
  }
  /* finish 'lua_callk'/'lua_pcall' */
  adjustresults(L, ci->nresults);
  /* call continuation function */
  if (!(ci->callstatus & CIST_STAT))  /* no call status? */
    ci->u.c.status = LUA_YIELD;  /* 'default' status */
  lua_assert(ci->u.c.status != LUA_OK);
  ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
  lua_unlock(L);
  n = (*ci->u.c.k)(L);
  lua_lock(L);
  api_checknelems(L, n);
  /* finish 'luaD_precall' */
  luaD_poscall(L, L->top - n);
}

static void unroll (lua_State *L, void *ud) {
  UNUSED(ud);
  for (;;) {
    if (L->ci == &L->base_ci)  /* stack is empty? */
      return;  /* coroutine finished normally */
    if (!isLua(L->ci))  /* C function? */
      finishCcall(L);
    else {  /* Lua function */
      luaV_finishOp(L);  /* finish interrupted instruction */
      luaV_execute(L);  /* execute down to higher C 'boundary' */
    }
  }
}

/*
** check whether thread has a suspended protected call
*/
static CallInfo *findpcall (lua_State *L) {
  CallInfo *ci;
  for (ci = L->ci; ci != NULL; ci = ci->previous) {  /* search for a pcall */
    if (ci->callstatus & CIST_YPCALL)
      return ci;
  }
  return NULL;  /* no pending pcall */
}

static int recover (lua_State *L, int status) {
  StkId oldtop;
  CallInfo *ci = findpcall(L);
  if (ci == NULL) return 0;  /* no recovery point */
  /* "finish" luaD_pcall */
  oldtop = restorestack(L, ci->extra);
  luaF_close(L, oldtop);
  seterrorobj(L, status, oldtop);
  L->ci = ci;
  L->allowhook = ci->u.c.old_allowhook;
  L->nny = 0;  /* should be zero to be yieldable */
  luaD_shrinkstack(L);
  L->errfunc = ci->u.c.old_errfunc;
  ci->callstatus |= CIST_STAT;  /* call has error status */
  ci->u.c.status = status;  /* (here it is) */
  return 1;  /* continue running the coroutine */
}

/*
** signal an error in the call to 'resume', not in the execution of the
** coroutine itself. (Such errors should not be handled by any coroutine
** error handler and should not kill the coroutine.)
*/
static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
  L->top = firstArg;  /* remove args from the stack */
  setsvalue2s(L, L->top, luaS_new(L, msg));  /* push error message */
  api_incr_top(L);
  luaD_throw(L, -1);  /* jump back to 'lua_resume' */
}

/*
** do the work for 'lua_resume' in protected mode
*/
static void resume_cb (lua_State *L, void *ud) {
  int nCcalls = L->nCcalls;
  StkId firstArg = cast(StkId, ud);
  CallInfo *ci = L->ci;
  if (nCcalls >= LUAI_MAXCCALLS)
    resume_error(L, "C stack overflow", firstArg);
  if (L->status == LUA_OK) {  /* may be starting a coroutine */
    if (ci != &L->base_ci)  /* not in base level? */
      resume_error(L, "cannot resume non-suspended coroutine", firstArg);
    /* coroutine is in base level; start running it */
    if (!luaD_precall(L, firstArg - 1, LUA_MULTRET))  /* Lua function? */
      luaV_execute(L);  /* call it */
  }
  else if (L->status != LUA_YIELD)
    resume_error(L, "cannot resume dead coroutine", firstArg);
  else {  /* resuming from previous yield */
    L->status = LUA_OK;
    ci->func = restorestack(L, ci->extra);
    if (isLua(ci))  /* yielded inside a hook? */
      luaV_execute(L);  /* just continue running Lua code */
    else {  /* 'common' yield */
      if (ci->u.c.k != NULL) {  /* does it have a continuation? */
        int n;
        ci->u.c.status = LUA_YIELD;  /* 'default' status */
        ci->callstatus |= CIST_YIELDED;
        lua_unlock(L);
        n = (*ci->u.c.k)(L);  /* call continuation */
        lua_lock(L);
        api_checknelems(L, n);
        firstArg = L->top - n;  /* yield results come from continuation */
      }
      luaD_poscall(L, firstArg);  /* finish 'luaD_precall' */
    }
    unroll(L, NULL);
  }
  lua_assert(nCcalls == L->nCcalls);
}

LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
  int status;
  int oldnny = L->nny;  /* save 'nny' */
  lua_lock(L);
  luai_userstateresume(L, nargs);
  L->nCcalls = (from) ? from->nCcalls + 1 : 1;
  L->nny = 0;  /* allow yields */
  api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
  status = luaD_rawrunprotected(L, resume_cb, L->top - nargs);
  if (status == -1)  /* error calling 'lua_resume'? */
    status = LUA_ERRRUN;
  else {  /* yield or regular error */
    while (status != LUA_OK && status != LUA_YIELD) {  /* error? */
      if (recover(L, status))  /* recover point? */
        status = luaD_rawrunprotected(L, unroll, NULL);  /* run continuation */
      else {  /* unrecoverable error */
        L->status = cast_byte(status);  /* mark thread as `dead' */
        seterrorobj(L, status, L->top);
        L->ci->top = L->top;
        break;
      }
    }
    lua_assert(status == L->status);
  }
  L->nny = oldnny;  /* restore 'nny' */
  L->nCcalls--;
  lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
  lua_unlock(L);
  return status;
}

LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
  CallInfo *ci = L->ci;
  luai_userstateyield(L, nresults);
  lua_lock(L);
  api_checknelems(L, nresults);
  if (L->nny > 0) {
    if (L != G(L)->mainthread)
      luaG_runerror(L, "attempt to yield across a C-call boundary");
    else
      luaG_runerror(L, "attempt to yield from outside a coroutine");
  }
  L->status = LUA_YIELD;
  ci->extra = savestack(L, ci->func);  /* save current 'func' */
  if (isLua(ci)) {  /* inside a hook? */
    api_check(L, k == NULL, "hooks cannot continue after yielding");
  }
  else {
    if ((ci->u.c.k = k) != NULL)  /* is there a continuation? */
      ci->u.c.ctx = ctx;  /* save context */
    ci->func = L->top - nresults - 1;  /* protect stack below results */
    luaD_throw(L, LUA_YIELD);
  }
  lua_assert(ci->callstatus & CIST_HOOKED);  /* must be inside a hook */
  lua_unlock(L);
  return 0;  /* return to 'luaD_hook' */
}

int luaD_pcall (lua_State *L, Pfunc func, void *u,
                ptrdiff_t old_top, ptrdiff_t ef) {
  int status;
  CallInfo *old_ci = L->ci;
  lu_byte old_allowhooks = L->allowhook;
  unsigned short old_nny = L->nny;
  ptrdiff_t old_errfunc = L->errfunc;
  L->errfunc = ef;
  status = luaD_rawrunprotected(L, func, u);
  if (status != LUA_OK) {  /* an error occurred? */
    StkId oldtop = restorestack(L, old_top);
    luaF_close(L, oldtop);  /* close possible pending closures */
    seterrorobj(L, status, oldtop);
    L->ci = old_ci;
    L->allowhook = old_allowhooks;
    L->nny = old_nny;
    luaD_shrinkstack(L);
  }
  L->errfunc = old_errfunc;
  return status;
}

/*
** Execute a protected parser.
*/
struct SParser {  /* data to `f_parser' */
  ZIO *z;
  Mbuffer buff;  /* dynamic structure used by the scanner */
  Dyndata dyd;  /* dynamic structures used by the parser */
  const char *mode;
  const char *name;
};

static void checkmode (lua_State *L, const char *mode, const char *x) {
  if (mode && strchr(mode, x[0]) == NULL) {
    luaO_pushfstring(L,
       "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
    luaD_throw(L, LUA_ERRSYNTAX);
  }
}

static void f_parser (lua_State *L, void *ud) {
  int i;
  Closure *cl;
  struct SParser *p = cast(struct SParser *, ud);
  int c = zgetc(p->z);  /* read first character */
  lua_assert(c != LUA_SIGNATURE[0]);  /* binary not supported */
  checkmode(L, p->mode, "text");
  cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
  lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
  for (i = 0; i < cl->l.nupvalues; i++) {  /* initialize upvalues */
    UpVal *up = luaF_newupval(L);
    cl->l.upvals[i] = up;
    luaC_objbarrier(L, cl, up);
  }
}

int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
                                        const char *mode) {
  struct SParser p;
  int status;
  L->nny++;  /* cannot yield during parsing */
  p.z = z; p.name = name; p.mode = mode;
  p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
  p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
  p.dyd.label.arr = NULL; p.dyd.label.size = 0;
  luaZ_initbuffer(L, &p.buff);
  status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
  luaZ_freebuffer(L, &p.buff);
  luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
  luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
  luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
  L->nny--;
  return status;
}
diff --git a/module/lua/setjmp/setjmp_x86_64.S b/module/lua/setjmp/setjmp_x86_64.S
index 7e13fea05dda..337fceb15b00 100644
--- a/module/lua/setjmp/setjmp_x86_64.S
+++ b/module/lua/setjmp/setjmp_x86_64.S
@@ -1,84 +1,72 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #if defined(_KERNEL) && defined(__linux__)
 #include <linux/linkage.h>
 #endif
 
-#ifndef RET
-#define RET ret
-#endif
-
-#undef ENTRY
-#define ENTRY(x) \
-	.text; \
-	.align 8; \
-	.globl x; \
-	.type x, @function; \
-x:
-
-#define SET_SIZE(x) \
-	.size x, [.-x]
-
 /*
  * Setjmp and longjmp implement non-local gotos using state vectors
  * type label_t.
  */
 
 #ifdef __x86_64__
 
-	ENTRY(setjmp)
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_ALIGN(setjmp, 8)
 	movq	%rsp, 0(%rdi)
 	movq	%rbp, 8(%rdi)
 	movq	%rbx, 16(%rdi)
 	movq	%r12, 24(%rdi)
 	movq	%r13, 32(%rdi)
 	movq	%r14, 40(%rdi)
 	movq	%r15, 48(%rdi)
 	movq	0(%rsp), %rdx	/* return address */
 	movq	%rdx, 56(%rdi)	/* rip */
 	xorl	%eax, %eax	/* return 0 */
 	RET
 	SET_SIZE(setjmp)
 
-	ENTRY(longjmp)
+ENTRY_ALIGN(longjmp, 8)
 	movq	0(%rdi), %rsp
 	movq	8(%rdi), %rbp
 	movq	16(%rdi), %rbx
 	movq	24(%rdi), %r12
 	movq	32(%rdi), %r13
 	movq	40(%rdi), %r14
 	movq	48(%rdi), %r15
 	movq	56(%rdi), %rdx	/* return address */
 	movq	%rdx, 0(%rsp)
 	xorl	%eax, %eax
 	incl	%eax		/* return 1 */
 	RET
 	SET_SIZE(longjmp)
 
 #ifdef __ELF__
 .section .note.GNU-stack,"",%progbits
 #endif
 
 #endif /* __x86_64__ */
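Note on the interaction between the two files above: `luaD_rawrunprotected()` and `luaD_throw()` implement Lua's protected calls with the `LUAI_TRY`/`LUAI_THROW` macros, and in the kernel build those macros are ultimately backed by the `label_t`-based `setjmp`/`longjmp` routines in `setjmp_x86_64.S` (setjmp returns 0 on the direct call and a non-zero value after longjmp). The fragment below is a minimal, self-contained sketch of that pattern; it is illustrative only, uses the hosted `<setjmp.h>` API as a stand-in for the kernel assembly, and the names `errjmp`, `run_protected`, and `throw_err` are hypothetical, not identifiers from the patch.

```c
/* Sketch of the protected-call pattern used by luaD_rawrunprotected(). */
#include <setjmp.h>
#include <stdio.h>

struct errjmp {
	jmp_buf b;           /* saved register state (label_t in the kernel) */
	volatile int status; /* error code carried across the jump */
};

static struct errjmp *cur;   /* innermost handler, like L->errorJmp */

static void throw_err(int code) {
	cur->status = code;  /* record status, then unwind ... */
	longjmp(cur->b, 1);  /* ... never returns; setjmp() now yields 1 */
}

static int run_protected(void (*f)(void)) {
	struct errjmp lj = { .status = 0 };
	struct errjmp *prev = cur;  /* chain handlers, like lj.previous */
	cur = &lj;
	if (setjmp(lj.b) == 0)      /* direct return: run the body */
		f();
	cur = prev;                 /* restore the previous handler */
	return lj.status;           /* 0 on success, error code otherwise */
}

static void body(void) {
	throw_err(2);               /* analogous to luaD_throw(L, LUA_ERRRUN) */
}

int main(void) {
	printf("status = %d\n", run_protected(body));  /* prints "status = 2" */
	return 0;
}
```

As in Lua's `struct lua_longjmp`, the status field is declared `volatile` so its value is well defined after the longjmp unwinds back into `setjmp()`.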