diff --git a/module/Kbuild.in b/module/Kbuild.in index 7a20e6ee4615..581d50e64b42 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -1,473 +1,469 @@ # When integrated in to a monolithic kernel the spl module must appear # first. This ensures its module initialization function is run before # any of the other module initialization functions which depend on it. ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include icp_include = @abs_srcdir@/icp/include zstd_include = @abs_srcdir@/zstd/include ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs icp_include = $(srctree)/$(src)/icp/include zstd_include = $(srctree)/$(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build. 
ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ endif asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value obj-$(CONFIG_ZFS) := spl.o zfs.o SPL_OBJS := \ spl-atomic.o \ spl-condvar.o \ spl-cred.o \ spl-err.o \ spl-generic.o \ spl-kmem-cache.o \ spl-kmem.o \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ spl-tsd.o \ spl-vmem.o \ spl-xdr.o \ spl-zlib.o \ spl-zone.o spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS)) zfs-objs += avl/avl.o ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ algs/modes/ctr.o \ algs/modes/ecb.o \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ algs/sha2/sha2.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ api/kcf_cipher.o \ api/kcf_ctxops.o \ api/kcf_mac.o \ core/kcf_callprov.o \ core/kcf_mech_tabs.o \ core/kcf_prov_lib.o \ core/kcf_prov_tabs.o \ core/kcf_sched.o \ illumos-crypto.o \ io/aes.o \ io/sha2_mod.o \ io/skein_mod.o \ spi/kcf_spi.o ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ asm-x86_64/blake3/blake3_avx2.o \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o \ asm-x86_64/sha2/sha256_impl.o \ asm-x86_64/sha2/sha512_impl.o ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ 
algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ asm-aarch64/blake3/b3_aarch64_sse41.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ asm-ppc64/blake3/b3_ppc64le_sse41.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -# Suppress objtool "can't find jump dest instruction at" warnings. They -# are caused by the constants which are defined in the text section of the -# assembly file using .byte instructions (e.g. bswap_mask). The objtool -# utility tries to interpret them as opcodes and obviously fails doing so. +# Suppress objtool "return with modified stack frame" warnings. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y -OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y # Suppress objtool "unsupported stack pointer realignment" warnings. We are # not using a DRAP register while aligning the stack to a 64 byte boundary. # See #6950 for the reasoning. 
OBJECT_FILES_NON_STANDARD_sha256_impl.o := y OBJECT_FILES_NON_STANDARD_sha512_impl.o := y LUA_OBJS := \ lapi.o \ lauxlib.o \ lbaselib.o \ lcode.o \ lcompat.o \ lcorolib.o \ lctype.o \ ldebug.o \ ldo.o \ lfunc.o \ lgc.o \ llex.o \ lmem.o \ lobject.o \ lopcodes.o \ lparser.o \ lstate.o \ lstring.o \ lstrlib.o \ ltable.o \ ltablib.o \ ltm.o \ lvm.o \ lzio.o \ setjmp/setjmp.o zfs-objs += $(addprefix lua/,$(LUA_OBJS)) NVPAIR_OBJS := \ fnvpair.o \ nvpair.o \ nvpair_alloc_fixed.o \ nvpair_alloc_spl.o zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS)) UNICODE_OBJS := \ u8_textprep.o \ uconv.o zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) ZCOMMON_OBJS := \ cityhash.o \ zfeature_common.o \ zfs_comutil.o \ zfs_deleg.o \ zfs_fletcher.o \ zfs_fletcher_superscalar.o \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ zpool_prop.o \ zprop_common.o ZCOMMON_OBJS_X86 := \ zfs_fletcher_avx512.o \ zfs_fletcher_intel.o \ zfs_fletcher_sse.o ZCOMMON_OBJS_ARM64 := \ zfs_fletcher_aarch64_neon.o zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS)) zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64)) # Zstd uses -O3 by default, so we should follow ZFS_ZSTD_FLAGS := -O3 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h # Set it for other compilers, too. 
ZFS_ZSTD_FLAGS += -fno-tree-vectorize # SSE register return with SSE disabled if -march=znverX is passed ZFS_ZSTD_FLAGS += -U__BMI__ # Quiet warnings about frame size due to unused code in unmodified zstd lib ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480 ZSTD_OBJS := \ zfs_zstd.o \ zstd_sparc.o ZSTD_UPSTREAM_OBJS := \ lib/common/entropy_common.o \ lib/common/error_private.o \ lib/common/fse_decompress.o \ lib/common/pool.o \ lib/common/zstd_common.o \ lib/compress/fse_compress.o \ lib/compress/hist.o \ lib/compress/huf_compress.o \ lib/compress/zstd_compress.o \ lib/compress/zstd_compress_literals.o \ lib/compress/zstd_compress_sequences.o \ lib/compress/zstd_compress_superblock.o \ lib/compress/zstd_double_fast.o \ lib/compress/zstd_fast.o \ lib/compress/zstd_lazy.o \ lib/compress/zstd_ldm.o \ lib/compress/zstd_opt.o \ lib/decompress/huf_decompress.o \ lib/decompress/zstd_ddict.o \ lib/decompress/zstd_decompress.o \ lib/decompress/zstd_decompress_block.o zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) # Disable aarch64 neon SIMD instructions for kernel mode $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) $(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ bptree.o \ bqueue.o \ btree.o \ dataset_kstats.o \ dbuf.o \ dbuf_stats.o \ ddt.o \ ddt_zap.o \ dmu.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ dmu_recv.o \ dmu_redact.o \ dmu_send.o \ dmu_traverse.o \ dmu_tx.o \ dmu_zfetch.o \ dnode.o \ dnode_sync.o \ dsl_bookmark.o \ dsl_crypt.o \ dsl_dataset.o \ dsl_deadlist.o \ dsl_deleg.o \ dsl_destroy.o \ dsl_dir.o \ 
dsl_pool.o \ dsl_prop.o \ dsl_scan.o \ dsl_synctask.o \ dsl_userhold.o \ edonr_zfs.o \ fm.o \ gzip.o \ hkdf.o \ lz4.o \ lz4_zfs.o \ lzjb.o \ metaslab.o \ mmp.o \ multilist.o \ objlist.o \ pathname.o \ range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ sha256.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ spa_history.o \ spa_log_spacemap.o \ spa_misc.o \ spa_stats.o \ space_map.o \ space_reftree.o \ txg.o \ uberblock.o \ unique.o \ vdev.o \ vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ vdev_raidz_math_scalar.o \ vdev_rebuild.o \ vdev_removal.o \ vdev_root.o \ vdev_trim.o \ zap.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ zcp_get.o \ zcp_global.o \ zcp_iter.o \ zcp_set.o \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ zfs_quota.o \ zfs_ratelimit.o \ zfs_replay.o \ zfs_rlock.o \ zfs_sa.o \ zfs_vnops.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ zle.o \ zrlock.o \ zthr.o \ zvol.o ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ policy.o \ qat.o \ qat_compress.o \ qat_crypt.o \ spa_misc_os.o \ trace.o \ vdev_disk.o \ vdev_file.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ zfs_dir.o \ zfs_file_os.o \ zfs_ioctl_os.o \ zfs_racct.o \ zfs_sysfs.o \ zfs_uio.o \ zfs_vfsops.o \ zfs_vnops_os.o \ zfs_znode.o \ zio_crypt.o \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ zvol_os.o ZFS_OBJS_X86 := \ vdev_raidz_math_avx2.o \ vdev_raidz_math_avx512bw.o \ vdev_raidz_math_avx512f.o \ vdev_raidz_math_sse2.o \ vdev_raidz_math_ssse3.o ZFS_OBJS_ARM64 := \ vdev_raidz_math_aarch64_neon.o \ vdev_raidz_math_aarch64_neonx2.o ZFS_OBJS_PPC_PPC64 := \ vdev_raidz_math_powerpc_altivec.o zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix 
os/linux/zfs/,$(ZFS_OBJS_OS)) zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) # Suppress incorrect warnings from versions of objtool which are not # aware of x86 EVEX prefix instructions used for AVX512. OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index 7414b3540f34..6da43ee00597 100644 --- a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -1,1264 +1,1266 @@ # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. # # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. 
Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl # and modified for ICP. Modification are kept at a bare minimum to ease later # upstream merges. 
#if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include .extern gcm_avx_can_use_movbe .text #ifdef HAVE_MOVBE .type _aesni_ctr32_ghash_6x,@function .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x .align 32 .Loop6x: addl $100663296,%ebx jc .Lhandle_ctr32 vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movbeq 88(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 80(%r14),%r12 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor 
%xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movbeq 72(%r14),%r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movbeq 64(%r14),%r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movbeq 56(%r14),%r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movbeq 48(%r14),%r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc %xmm15,%xmm10,%xmm10 movbeq 40(%r14),%r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movbeq 32(%r14),%r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor 
%xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movbeq 24(%r14),%r13 vaesenc %xmm15,%xmm11,%xmm11 movbeq 16(%r14),%r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movbeq 8(%r14),%r13 vaesenc %xmm1,%xmm13,%xmm13 movbeq 0(%r14),%r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. 
jb .Lenc_tail vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail .align 32 .Lhandle_ctr32: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) vmovdqa %xmm0,%xmm10 
vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x #endif /* ifdef HAVE_MOVBE */ .type _aesni_ctr32_ghash_no_movbe_6x,@function .align 32 _aesni_ctr32_ghash_no_movbe_6x: .cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 vmovdqu 0-128(%rcx),%xmm15 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpaddb %xmm2,%xmm11,%xmm12 vpaddb %xmm2,%xmm12,%xmm13 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm15,%xmm1,%xmm9 vmovdqu %xmm4,16+8(%rsp) jmp .Loop6x_nmb .align 32 .Loop6x_nmb: addl $100663296,%ebx jc .Lhandle_ctr32_nmb vmovdqu 0-32(%r9),%xmm3 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm15,%xmm10,%xmm10 vpxor %xmm15,%xmm11,%xmm11 .Lresume_ctr32_nmb: vmovdqu %xmm1,(%r8) vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 vpxor %xmm15,%xmm12,%xmm12 vmovups 16-128(%rcx),%xmm2 vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 xorq %r12,%r12 cmpq %r14,%r15 vaesenc %xmm2,%xmm9,%xmm9 vmovdqu 48+8(%rsp),%xmm0 vpxor %xmm15,%xmm13,%xmm13 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 vaesenc %xmm2,%xmm10,%xmm10 vpxor %xmm15,%xmm14,%xmm14 setnc %r12b vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vaesenc %xmm2,%xmm11,%xmm11 vmovdqu 16-32(%r9),%xmm3 negq %r12 vaesenc %xmm2,%xmm12,%xmm12 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 vpxor %xmm4,%xmm8,%xmm8 vaesenc %xmm2,%xmm13,%xmm13 vpxor %xmm5,%xmm1,%xmm4 andq $0x60,%r12 vmovups 32-128(%rcx),%xmm15 vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 vaesenc %xmm2,%xmm14,%xmm14 vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 leaq (%r14,%r12,1),%r14 vaesenc %xmm15,%xmm9,%xmm9 vpxor 16+8(%rsp),%xmm8,%xmm8 vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 vmovdqu 64+8(%rsp),%xmm0 vaesenc %xmm15,%xmm10,%xmm10 movq 88(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 80(%r14),%r12 bswapq %r12 vaesenc 
%xmm15,%xmm12,%xmm12 movq %r13,32+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,40+8(%rsp) vmovdqu 48-32(%r9),%xmm5 vaesenc %xmm15,%xmm14,%xmm14 vmovups 48-128(%rcx),%xmm15 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm3,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 vaesenc %xmm15,%xmm11,%xmm11 vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 vmovdqu 80+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vpxor %xmm1,%xmm4,%xmm4 vmovdqu 64-32(%r9),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vmovups 64-128(%rcx),%xmm15 vpxor %xmm2,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 vaesenc %xmm15,%xmm10,%xmm10 movq 72(%r14),%r13 bswapq %r13 vpxor %xmm5,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 movq 64(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 vmovdqu 96+8(%rsp),%xmm0 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,48+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,56+8(%rsp) vpxor %xmm2,%xmm4,%xmm4 vmovdqu 96-32(%r9),%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vmovups 80-128(%rcx),%xmm15 vpxor %xmm3,%xmm6,%xmm6 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 vaesenc %xmm15,%xmm10,%xmm10 movq 56(%r14),%r13 bswapq %r13 vpxor %xmm1,%xmm7,%xmm7 vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 vpxor 112+8(%rsp),%xmm8,%xmm8 vaesenc %xmm15,%xmm11,%xmm11 movq 48(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,64+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,72+8(%rsp) vpxor %xmm3,%xmm4,%xmm4 vmovdqu 112-32(%r9),%xmm3 vaesenc %xmm15,%xmm14,%xmm14 vmovups 96-128(%rcx),%xmm15 vpxor %xmm5,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm1,%xmm6,%xmm6 vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 vaesenc 
%xmm15,%xmm10,%xmm10 movq 40(%r14),%r13 bswapq %r13 vpxor %xmm2,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 movq 32(%r14),%r12 bswapq %r12 vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 vaesenc %xmm15,%xmm12,%xmm12 movq %r13,80+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 movq %r12,88+8(%rsp) vpxor %xmm5,%xmm6,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor %xmm1,%xmm6,%xmm6 vmovups 112-128(%rcx),%xmm15 vpslldq $8,%xmm6,%xmm5 vpxor %xmm2,%xmm4,%xmm4 vmovdqu 16(%r11),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor %xmm8,%xmm7,%xmm7 vaesenc %xmm15,%xmm10,%xmm10 vpxor %xmm5,%xmm4,%xmm4 movq 24(%r14),%r13 bswapq %r13 vaesenc %xmm15,%xmm11,%xmm11 movq 16(%r14),%r12 bswapq %r12 vpalignr $8,%xmm4,%xmm4,%xmm0 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 movq %r13,96+8(%rsp) vaesenc %xmm15,%xmm12,%xmm12 movq %r12,104+8(%rsp) vaesenc %xmm15,%xmm13,%xmm13 vmovups 128-128(%rcx),%xmm1 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vmovups 144-128(%rcx),%xmm15 vaesenc %xmm1,%xmm10,%xmm10 vpsrldq $8,%xmm6,%xmm6 vaesenc %xmm1,%xmm11,%xmm11 vpxor %xmm6,%xmm7,%xmm7 vaesenc %xmm1,%xmm12,%xmm12 vpxor %xmm0,%xmm4,%xmm4 movq 8(%r14),%r13 bswapq %r13 vaesenc %xmm1,%xmm13,%xmm13 movq 0(%r14),%r12 bswapq %r12 vaesenc %xmm1,%xmm14,%xmm14 vmovups 160-128(%rcx),%xmm1 cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 176-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 192-128(%rcx),%xmm1 cmpl $14,%ebp // ICP does not zero key schedule. 
jb .Lenc_tail_nmb vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vaesenc %xmm1,%xmm9,%xmm9 vaesenc %xmm1,%xmm10,%xmm10 vaesenc %xmm1,%xmm11,%xmm11 vaesenc %xmm1,%xmm12,%xmm12 vaesenc %xmm1,%xmm13,%xmm13 vmovups 208-128(%rcx),%xmm15 vaesenc %xmm1,%xmm14,%xmm14 vmovups 224-128(%rcx),%xmm1 jmp .Lenc_tail_nmb .align 32 .Lhandle_ctr32_nmb: vmovdqu (%r11),%xmm0 vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vmovdqu 0-32(%r9),%xmm3 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm15,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm15,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpshufb %xmm0,%xmm14,%xmm14 vpshufb %xmm0,%xmm1,%xmm1 jmp .Lresume_ctr32_nmb .align 32 .Lenc_tail_nmb: vaesenc %xmm15,%xmm9,%xmm9 vmovdqu %xmm7,16+8(%rsp) vpalignr $8,%xmm4,%xmm4,%xmm8 vaesenc %xmm15,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 vpxor 0(%rdi),%xmm1,%xmm2 vaesenc %xmm15,%xmm11,%xmm11 vpxor 16(%rdi),%xmm1,%xmm0 vaesenc %xmm15,%xmm12,%xmm12 vpxor 32(%rdi),%xmm1,%xmm5 vaesenc %xmm15,%xmm13,%xmm13 vpxor 48(%rdi),%xmm1,%xmm6 vaesenc %xmm15,%xmm14,%xmm14 vpxor 64(%rdi),%xmm1,%xmm7 vpxor 80(%rdi),%xmm1,%xmm3 vmovdqu (%r8),%xmm1 vaesenclast %xmm2,%xmm9,%xmm9 vmovdqu 32(%r11),%xmm2 vaesenclast %xmm0,%xmm10,%xmm10 vpaddb %xmm2,%xmm1,%xmm0 movq %r13,112+8(%rsp) leaq 96(%rdi),%rdi vaesenclast %xmm5,%xmm11,%xmm11 vpaddb %xmm2,%xmm0,%xmm5 movq %r12,120+8(%rsp) leaq 96(%rsi),%rsi vmovdqu 0-128(%rcx),%xmm15 vaesenclast %xmm6,%xmm12,%xmm12 vpaddb %xmm2,%xmm5,%xmm6 vaesenclast %xmm7,%xmm13,%xmm13 vpaddb %xmm2,%xmm6,%xmm7 vaesenclast %xmm3,%xmm14,%xmm14 vpaddb %xmm2,%xmm7,%xmm3 addq $0x60,%r10 subq $0x6,%rdx jc .L6x_done_nmb vmovups %xmm9,-96(%rsi) vpxor %xmm15,%xmm1,%xmm9 vmovups %xmm10,-80(%rsi) 
vmovdqa %xmm0,%xmm10 vmovups %xmm11,-64(%rsi) vmovdqa %xmm5,%xmm11 vmovups %xmm12,-48(%rsi) vmovdqa %xmm6,%xmm12 vmovups %xmm13,-32(%rsi) vmovdqa %xmm7,%xmm13 vmovups %xmm14,-16(%rsi) vmovdqa %xmm3,%xmm14 vmovdqu 32+8(%rsp),%xmm7 jmp .Loop6x_nmb .L6x_done_nmb: vpxor 16+8(%rsp),%xmm8,%xmm8 vpxor %xmm4,%xmm8,%xmm8 RET .cfi_endproc .size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function .align 32 aesni_gcm_decrypt: .cfi_startproc xorq %r10,%r10 cmpq $0x60,%rdx jb .Lgcm_dec_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 vmovdqu (%r9),%xmm8 andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx movq 32(%r9),%r9 leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. 
vpshufb %xmm0,%xmm8,%xmm8 andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Ldec_no_key_aliasing cmpq $768,%r15 jnc .Ldec_no_key_aliasing subq %r15,%rsp .Ldec_no_key_aliasing: vmovdqu 80(%rdi),%xmm7 leaq (%rdi),%r14 vmovdqu 64(%rdi),%xmm4 leaq -192(%rdi,%rdx,1),%r15 vmovdqu 48(%rdi),%xmm5 shrq $4,%rdx xorq %r10,%r10 vmovdqu 32(%rdi),%xmm6 vpshufb %xmm0,%xmm7,%xmm7 vmovdqu 16(%rdi),%xmm2 vpshufb %xmm0,%xmm4,%xmm4 vmovdqu (%rdi),%xmm3 vpshufb %xmm0,%xmm5,%xmm5 vmovdqu %xmm4,48(%rsp) vpshufb %xmm0,%xmm6,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm2,%xmm2 vmovdqu %xmm6,80(%rsp) vpshufb %xmm0,%xmm3,%xmm3 vmovdqu %xmm2,96(%rsp) vmovdqu %xmm3,112(%rsp) #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: vmovups %xmm9,-96(%rsi) vmovups %xmm10,-80(%rsi) vmovups %xmm11,-64(%rsi) vmovups %xmm12,-48(%rsi) vmovups %xmm13,-32(%rsi) vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: movq %r10,%rax RET .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
vmovups 16-128(%rcx),%xmm15 leaq 32-128(%rcx),%r12 vpxor %xmm4,%xmm1,%xmm9 addl $100663296,%ebx jc .Lhandle_ctr32_2 vpaddb %xmm2,%xmm1,%xmm10 vpaddb %xmm2,%xmm10,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddb %xmm2,%xmm11,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddb %xmm2,%xmm12,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpaddb %xmm2,%xmm13,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpaddb %xmm2,%xmm14,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc %xmm15,%xmm9,%xmm9 vaesenc %xmm15,%xmm10,%xmm10 vaesenc %xmm15,%xmm11,%xmm11 vaesenc %xmm15,%xmm12,%xmm12 vaesenc %xmm15,%xmm13,%xmm13 vaesenc %xmm15,%xmm14,%xmm14 vmovups (%r12),%xmm15 leaq 16(%r12),%r12 decl %r13d jnz .Loop_ctr32 vmovdqu (%r12),%xmm3 vaesenc %xmm15,%xmm9,%xmm9 vpxor 0(%rdi),%xmm3,%xmm4 vaesenc %xmm15,%xmm10,%xmm10 vpxor 16(%rdi),%xmm3,%xmm5 vaesenc %xmm15,%xmm11,%xmm11 vpxor 32(%rdi),%xmm3,%xmm6 vaesenc %xmm15,%xmm12,%xmm12 vpxor 48(%rdi),%xmm3,%xmm8 vaesenc %xmm15,%xmm13,%xmm13 vpxor 64(%rdi),%xmm3,%xmm2 vaesenc %xmm15,%xmm14,%xmm14 vpxor 80(%rdi),%xmm3,%xmm3 leaq 96(%rdi),%rdi vaesenclast %xmm4,%xmm9,%xmm9 vaesenclast %xmm5,%xmm10,%xmm10 vaesenclast %xmm6,%xmm11,%xmm11 vaesenclast %xmm8,%xmm12,%xmm12 vaesenclast %xmm2,%xmm13,%xmm13 vaesenclast %xmm3,%xmm14,%xmm14 vmovups %xmm9,0(%rsi) vmovups %xmm10,16(%rsi) vmovups %xmm11,32(%rsi) vmovups %xmm12,48(%rsi) vmovups %xmm13,64(%rsi) vmovups %xmm14,80(%rsi) leaq 96(%rsi),%rsi RET .align 32 .Lhandle_ctr32_2: vpshufb %xmm0,%xmm1,%xmm6 vmovdqu 48(%r11),%xmm5 vpaddd 64(%r11),%xmm6,%xmm10 vpaddd %xmm5,%xmm6,%xmm11 vpaddd %xmm5,%xmm10,%xmm12 vpshufb %xmm0,%xmm10,%xmm10 vpaddd %xmm5,%xmm11,%xmm13 vpshufb %xmm0,%xmm11,%xmm11 vpxor %xmm4,%xmm10,%xmm10 vpaddd %xmm5,%xmm12,%xmm14 vpshufb %xmm0,%xmm12,%xmm12 vpxor %xmm4,%xmm11,%xmm11 vpaddd %xmm5,%xmm13,%xmm1 vpshufb %xmm0,%xmm13,%xmm13 vpxor %xmm4,%xmm12,%xmm12 vpshufb %xmm0,%xmm14,%xmm14 vpxor %xmm4,%xmm13,%xmm13 vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 .cfi_endproc .size 
_aesni_ctr32_6x,.-_aesni_ctr32_6x /* aesni_gcm_encrypt: bulk AES-GCM encryption. Returns 0 in %rax immediately when %rdx is below 288. Otherwise %r10 is preset to 192 before the bulk helper is called and whatever %r10 holds afterwards is returned (presumably the number of bytes processed; confirm in the ghash_6x helpers, which are not shown here). As used below: %rdi = input, %rsi = output, %rdx = length (shifted right by 4 into a 16-byte block count), %rcx = key schedule with the round count read at offset 504, %r8 = 16-byte counter block, %r9 = context whose first 16 bytes are the running hash and whose qword at offset 32 points to the hash table. %r9 is pushed so the epilogue can store the updated hash back. */ .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,@function .align 32 aesni_gcm_encrypt: .cfi_startproc xorq %r10,%r10 cmpq $288,%rdx jb .Lgcm_enc_abort leaq (%rsp),%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 pushq %r9 .cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 addq $-128,%rsp movl 12(%r8),%ebx leaq .Lbswap_mask(%rip),%r11 leaq -128(%rcx),%r14 movq $0xf80,%r15 leaq 128(%rcx),%rcx vmovdqu (%r11),%xmm0 andq $-128,%rsp movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. andq %r15,%r14 andq %rsp,%r15 subq %r14,%r15 jc .Lenc_no_key_aliasing cmpq $768,%r15 jnc .Lenc_no_key_aliasing subq %r15,%rsp .Lenc_no_key_aliasing: leaq (%rsi),%r14 leaq -192(%rsi,%rdx,1),%r15 shrq $4,%rdx call _aesni_ctr32_6x vpshufb %xmm0,%xmm9,%xmm8 vpshufb %xmm0,%xmm10,%xmm2 vmovdqu %xmm8,112(%rsp) vpshufb %xmm0,%xmm11,%xmm4 vmovdqu %xmm2,96(%rsp) vpshufb %xmm0,%xmm12,%xmm5 vmovdqu %xmm4,80(%rsp) vpshufb %xmm0,%xmm13,%xmm6 vmovdqu %xmm5,64(%rsp) vpshufb %xmm0,%xmm14,%xmm7 vmovdqu %xmm6,48(%rsp) call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 movq 32(%r9),%r9 leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 /* NOTE(review): in the non-_KERNEL branch the testl memory operand is gcm_avx_can_use_movbe@GOTPCREL(%rip), which presumably addresses the GOT slot holding the address of the variable rather than the variable itself; if so, bit 0 of a pointer is tested and the movbe path may never be selected in userland builds - confirm. */ #ifdef HAVE_MOVBE #ifdef _KERNEL testl $1,gcm_avx_can_use_movbe(%rip) #else testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) #endif jz 1f call _aesni_ctr32_ghash_6x jmp 2f 1: #endif call _aesni_ctr32_ghash_no_movbe_6x 2: /* after the bulk helper: store the last six output blocks and fold the byte-swapped blocks kept at 16..112(%rsp) and in %xmm9-%xmm14 into the hash in %xmm8, reducing with the constant at 16(%r11) (.Lpoly) */ vmovdqu 32(%rsp),%xmm7 vmovdqu (%r11),%xmm0 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm7,%xmm7,%xmm1 vmovdqu 32-32(%r9),%xmm15 vmovups %xmm9,-96(%rsi) vpshufb %xmm0,%xmm9,%xmm9 vpxor %xmm7,%xmm1,%xmm1 vmovups %xmm10,-80(%rsi) vpshufb %xmm0,%xmm10,%xmm10 vmovups %xmm11,-64(%rsi) vpshufb %xmm0,%xmm11,%xmm11 vmovups %xmm12,-48(%rsi) vpshufb %xmm0,%xmm12,%xmm12 vmovups %xmm13,-32(%rsi) vpshufb %xmm0,%xmm13,%xmm13 vmovups %xmm14,-16(%rsi)
vpshufb %xmm0,%xmm14,%xmm14 vmovdqu %xmm9,16(%rsp) vmovdqu 48(%rsp),%xmm6 vmovdqu 16-32(%r9),%xmm0 vpunpckhqdq %xmm6,%xmm6,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 vpxor %xmm6,%xmm2,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vmovdqu 64(%rsp),%xmm9 vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm9,%xmm9,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 vpxor %xmm9,%xmm5,%xmm5 vpxor %xmm7,%xmm6,%xmm6 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vmovdqu 80(%rsp),%xmm1 vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm4,%xmm7,%xmm7 vpunpckhqdq %xmm1,%xmm1,%xmm4 vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpxor %xmm6,%xmm9,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 96(%rsp),%xmm2 vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm7,%xmm6,%xmm6 vpunpckhqdq %xmm2,%xmm2,%xmm7 vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 vpxor %xmm2,%xmm7,%xmm7 vpxor %xmm9,%xmm1,%xmm1 vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm5,%xmm4,%xmm4 vpxor 112(%rsp),%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 vmovdqu 112-32(%r9),%xmm0 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpxor %xmm6,%xmm5,%xmm5 vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 vpxor %xmm4,%xmm7,%xmm4 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 vmovdqu 0-32(%r9),%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm1 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 vpxor %xmm14,%xmm1,%xmm1 vpxor %xmm5,%xmm6,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 vmovdqu 32-32(%r9),%xmm15 vpxor %xmm2,%xmm8,%xmm7 vpxor %xmm4,%xmm9,%xmm6 vmovdqu 16-32(%r9),%xmm0 vpxor %xmm5,%xmm7,%xmm9 vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 vpxor %xmm9,%xmm6,%xmm6 vpunpckhqdq %xmm13,%xmm13,%xmm2 vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 vpxor %xmm13,%xmm2,%xmm2 vpslldq $8,%xmm6,%xmm9 vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 vpxor
%xmm9,%xmm5,%xmm8 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm6,%xmm7,%xmm7 vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 vmovdqu 48-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm12,%xmm12,%xmm9 vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 vpxor %xmm12,%xmm9,%xmm9 vpxor %xmm14,%xmm13,%xmm13 vpalignr $8,%xmm8,%xmm8,%xmm14 vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 vmovdqu 80-32(%r9),%xmm15 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 vmovdqu 64-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm11,%xmm11,%xmm1 vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm13,%xmm12,%xmm12 vxorps 16(%rsp),%xmm7,%xmm7 vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 vpxor %xmm2,%xmm9,%xmm9 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 vmovdqu 96-32(%r9),%xmm3 vpxor %xmm4,%xmm5,%xmm5 vpunpckhqdq %xmm10,%xmm10,%xmm2 vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 vpxor %xmm10,%xmm2,%xmm2 vpalignr $8,%xmm8,%xmm8,%xmm14 vpxor %xmm12,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 vmovdqu 128-32(%r9),%xmm15 vpxor %xmm9,%xmm1,%xmm1 vxorps %xmm7,%xmm14,%xmm14 vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 vxorps %xmm14,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 vmovdqu 112-32(%r9),%xmm0 vpxor %xmm5,%xmm4,%xmm4 vpunpckhqdq %xmm8,%xmm8,%xmm9 vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 vpxor %xmm8,%xmm9,%xmm9 vpxor %xmm11,%xmm10,%xmm10 vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 vpxor %xmm1,%xmm2,%xmm2 vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 vpxor %xmm4,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 vpxor %xmm10,%xmm7,%xmm7 vpxor %xmm2,%xmm6,%xmm6 vpxor %xmm5,%xmm7,%xmm4 vpxor %xmm4,%xmm6,%xmm6 vpslldq $8,%xmm6,%xmm1 vmovdqu 16(%r11),%xmm3 vpsrldq $8,%xmm6,%xmm6 vpxor %xmm1,%xmm5,%xmm8 vpxor %xmm6,%xmm7,%xmm7 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm2,%xmm8,%xmm8 vpalignr $8,%xmm8,%xmm8,%xmm2 vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8
/* epilogue of aesni_gcm_encrypt: reload the caller %r9 saved at -56(%rax), store the hash held in %xmm8 to (%r9), restore the callee-saved registers and the stack pointer from the frame anchored at %rax, then return %r10 in %rax */ movq -56(%rax),%r9 .cfi_restore %r9 vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: movq %r10,%rax RET .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt /* Some utility routines */ /* * clear all fpu registers * void clear_fpu_regs_avx(void); */ .globl clear_fpu_regs_avx .type clear_fpu_regs_avx,@function .align 32 clear_fpu_regs_avx: /* a single vzeroall instruction; takes no arguments and returns nothing */ vzeroall RET .size clear_fpu_regs_avx,.-clear_fpu_regs_avx /* * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); * * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and * stores the result at `dst'. The XOR is performed using FPU registers, * so make sure FPU state is saved when running this in the kernel. */ .globl gcm_xor_avx .type gcm_xor_avx,@function .align 32 gcm_xor_avx: /* %rdi = src, %rsi = dst; uses the legacy-SSE movdqu and pxor forms (no VEX prefix) despite the _avx suffix; clobbers %xmm0 and %xmm1 */ movdqu (%rdi), %xmm0 movdqu (%rsi), %xmm1 pxor %xmm1, %xmm0 movdqu %xmm0, (%rsi) RET .size gcm_xor_avx,.-gcm_xor_avx /* * Toggle a boolean_t value atomically and return the new value.
* boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); */ .globl atomic_toggle_boolean_nv .type atomic_toggle_boolean_nv,@function .align 32 atomic_toggle_boolean_nv: /* %rdi = pointer to the value. The locked xorl flips bit 0 in memory and sets ZF when the resulting 32-bit value is zero: in that case %eax keeps the 0 loaded up front, otherwise 1 is returned. */ xorl %eax, %eax lock xorl $1, (%rdi) jz 1f movl $1, %eax 1: RET .size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv +.pushsection .rodata .align 64 /* 16-byte constants. The code in this file reaches them as fixed offsets from .Lbswap_mask (held in %r11): +16 .Lpoly, +32 .Lone_msb, +48 .Ltwo_lsb, +64 .Lone_lsb, so the order must not change. The final .byte run is the NUL-terminated ASCII string "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro@openssl.org>". */ .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.popsection /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S index 77a3ce185952..d7cdaeb368d7 100644 --- a/module/icp/asm-x86_64/modes/ghash-x86_64.S +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -1,717 +1,720 @@ # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the Apache License 2.0 (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/.
# ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. # # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 
2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # Generated once from # https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl # and modified for ICP. Modifications are kept at a bare minimum to ease later # upstream merges.
#if defined(__x86_64__) && defined(HAVE_AVX) && \ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) #define _ASM #include .text /* gcm_gmult_clmul: one GF(2^128) multiplication. Loads the 16-byte value at (%rdi), byte-swaps it with .Lbswap_mask, multiplies it by the 16 bytes at (%rsi) (using the 16 bytes at 32(%rsi) as the third carry-less multiply operand), reduces, swaps back and stores the result at (%rdi). The .byte sequences are hand-encoded instructions: 102,15,56,0,197 is pshufb %xmm5,%xmm0 and the 102,15,58,68,... forms are pclmulqdq. Clobbers %xmm0-%xmm5. The inner label .L_gmult_clmul is also the jump target of gcm_gmult_avx. */ .globl gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: .cfi_startproc .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) RET .cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul /* gcm_init_htab_avx: fills the hash table at (%rdi) from the 16-byte hash key at (%rsi). The loop body runs four times (%r10); each pass stores two successive powers of the preprocessed key at 0(%rdi) and 16(%rdi), advances %rdi by 48, and the following -16(%rdi) store fills the third slot of that group with the xor-of-halves values of the pair, for 192 bytes in total. */ .globl gcm_init_htab_avx .type gcm_init_htab_avx,@function .align 32 gcm_init_htab_avx: .cfi_startproc vzeroupper vmovdqu (%rsi),%xmm2 // KCF/ICP stores H in network byte order with the hi qword first // so we need to swap all bytes, not the 2 qwords.
vmovdqu .Lbswap_mask(%rip),%xmm4 vpshufb %xmm4,%xmm2,%xmm2 /* shift the 128-bit key left by one bit and XOR in .L0x1c2_polynomial when the top bit was set (the vpcmpgtd mask is all-ones when the broadcast top dword is negative) */ vpshufd $255,%xmm2,%xmm4 vpsrlq $63,%xmm2,%xmm3 vpsllq $1,%xmm2,%xmm2 vpxor %xmm5,%xmm5,%xmm5 vpcmpgtd %xmm4,%xmm5,%xmm5 vpslldq $8,%xmm3,%xmm3 vpor %xmm3,%xmm2,%xmm2 vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm2,%xmm2,%xmm6 vmovdqa %xmm2,%xmm0 vpxor %xmm2,%xmm6,%xmm6 movq $4,%r10 jmp .Linit_start_avx .align 32 /* loop head: store the xor-of-halves slot for the pair written by the previous pass, then multiply the running power in %xmm0 by the key in %xmm2 and reduce */ .Linit_loop_avx: vpalignr $8,%xmm3,%xmm4,%xmm5 vmovdqu %xmm5,-16(%rdi) vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 /* first-pass entry point: keep the current power in %xmm5, compute the next one into %xmm0, then store both */ .Linit_start_avx: vmovdqa %xmm0,%xmm5 vpunpckhqdq %xmm0,%xmm0,%xmm3 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 vpxor %xmm0,%xmm1,%xmm4 vpxor %xmm4,%xmm3,%xmm3 vpslldq $8,%xmm3,%xmm4 vpsrldq $8,%xmm3,%xmm3 vpxor %xmm4,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 vpsllq $57,%xmm0,%xmm3 vpsllq $62,%xmm0,%xmm4 vpxor %xmm3,%xmm4,%xmm4 vpsllq $63,%xmm0,%xmm3 vpxor %xmm3,%xmm4,%xmm4 vpslldq $8,%xmm4,%xmm3 vpsrldq $8,%xmm4,%xmm4 vpxor %xmm3,%xmm0,%xmm0 vpxor %xmm4,%xmm1,%xmm1 vpsrlq $1,%xmm0,%xmm4 vpxor %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $5,%xmm4,%xmm4 vpxor %xmm4,%xmm0,%xmm0 vpsrlq $1,%xmm0,%xmm0 vpxor %xmm1,%xmm0,%xmm0 vpshufd $78,%xmm5,%xmm3 vpshufd $78,%xmm0,%xmm4 vpxor %xmm5,%xmm3,%xmm3 vmovdqu %xmm5,0(%rdi) vpxor %xmm0,%xmm4,%xmm4 vmovdqu
%xmm0,16(%rdi) leaq 48(%rdi),%rdi subq $1,%r10 jnz .Linit_loop_avx /* the final xor-of-halves slot is written with the two vpalignr sources in the opposite order from the in-loop store */ vpalignr $8,%xmm4,%xmm3,%xmm5 vmovdqu %xmm5,-16(%rdi) vzeroupper RET .cfi_endproc .size gcm_init_htab_avx,.-gcm_init_htab_avx /* gcm_gmult_avx: no body of its own; jumps to .L_gmult_clmul inside gcm_gmult_clmul and returns from there */ .globl gcm_gmult_avx .type gcm_gmult_avx,@function .align 32 gcm_gmult_avx: .cfi_startproc jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx /* gcm_ghash_avx: folds %rcx bytes of input at (%rdx) into the 16-byte hash at (%rdi) using the table at (%rsi) (biased by +64 below). Inputs of 0x80 bytes or more go through the eight-block .Loop8x_avx; anything left over goes through .Lshort_avx, which consumes 16 bytes per step. %r10 points at .L0x1c2_polynomial and %xmm13 holds the byte-swap mask throughout. Assumes %rcx is a non-zero multiple of 16 - TODO confirm against callers. */ .globl gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: .cfi_startproc vzeroupper vmovdqu (%rdi),%xmm10 leaq .L0x1c2_polynomial(%rip),%r10 leaq 64(%rsi),%rsi vmovdqu .Lbswap_mask(%rip),%xmm13 vpshufb %xmm13,%xmm10,%xmm10 cmpq $0x80,%rcx jb .Lshort_avx subq $0x80,%rcx vmovdqu 112(%rdx),%xmm14 vmovdqu 0-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vmovdqu 32-64(%rsi),%xmm7 vpunpckhqdq %xmm14,%xmm14,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm14,%xmm9,%xmm9 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 80(%rdx),%xmm14 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 48-64(%rsi),%xmm6 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 64(%rdx),%xmm15 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 48(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 32(%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vmovdqu 16(%rdx),%xmm14 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm4,%xmm1,%xmm1 vpshufb %xmm13,%xmm14,%xmm14 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpxor %xmm5,%xmm2,%xmm2 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu (%rdx),%xmm15 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm1,%xmm4,%xmm4 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 leaq 128(%rdx),%rdx cmpq $0x80,%rcx jb .Ltail_avx vpxor %xmm10,%xmm15,%xmm15 subq $0x80,%rcx jmp .Loop8x_avx .align 32 /* main loop of gcm_ghash_avx: consumes 0x80 bytes of input at (%rdx) per iteration while reducing the previous aggregate with the constant at (%r10) */ .Loop8x_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vmovdqu 112(%rdx),%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpxor %xmm15,%xmm8,%xmm8 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 vmovdqu 0-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 vmovdqu 32-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vmovdqu 96(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpxor %xmm3,%xmm10,%xmm10 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vxorps %xmm4,%xmm11,%xmm11 vmovdqu 16-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm5,%xmm12,%xmm12 vxorps %xmm15,%xmm8,%xmm8 vmovdqu 80(%rdx),%xmm14 vpxor %xmm10,%xmm12,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpxor %xmm11,%xmm12,%xmm12 vpslldq $8,%xmm12,%xmm9 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vpsrldq $8,%xmm12,%xmm12 vpxor %xmm9,%xmm10,%xmm10
vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm14 vxorps %xmm12,%xmm11,%xmm11 vpxor %xmm1,%xmm4,%xmm4 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 80-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 64(%rdx),%xmm15 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vxorps %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vmovdqu 48(%rdx),%xmm14 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 96-64(%rsi),%xmm6 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 128-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu 32(%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpxor %xmm3,%xmm0,%xmm0 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm4,%xmm1,%xmm1 vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm5,%xmm2,%xmm2 vxorps %xmm12,%xmm10,%xmm10 vmovdqu 16(%rdx),%xmm14 vpalignr $8,%xmm10,%xmm10,%xmm12 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 vpshufb %xmm13,%xmm14,%xmm14 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 vmovdqu 144-64(%rsi),%xmm6 vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 vxorps %xmm11,%xmm12,%xmm12 vpunpckhqdq %xmm14,%xmm14,%xmm9 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 vmovdqu 176-64(%rsi),%xmm7 vpxor %xmm14,%xmm9,%xmm9 vpxor %xmm2,%xmm5,%xmm5 vmovdqu (%rdx),%xmm15 vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 vpshufb %xmm13,%xmm15,%xmm15 vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 vmovdqu 160-64(%rsi),%xmm6 vpxor %xmm12,%xmm15,%xmm15 vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 vpxor
%xmm10,%xmm15,%xmm15 leaq 128(%rdx),%rdx subq $0x80,%rcx jnc .Loop8x_avx addq $0x80,%rcx jmp .Ltail_no_xor_avx .align 32 /* short path of gcm_ghash_avx: starts from the last 16-byte block and walks backwards through the input, using one table entry per block; handles at most eight blocks before falling into the tail */ .Lshort_avx: vmovdqu -16(%rdx,%rcx,1),%xmm14 leaq (%rdx,%rcx,1),%rdx vmovdqu 0-64(%rsi),%xmm6 vmovdqu 32-64(%rsi),%xmm7 vpshufb %xmm13,%xmm14,%xmm15 vmovdqa %xmm0,%xmm3 vmovdqa %xmm1,%xmm4 vmovdqa %xmm2,%xmm5 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -32(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 16-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -48(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 48-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 80-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -64(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 64-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -80(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 96-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu 128-64(%rsi),%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -96(%rdx),%xmm14 vpxor
%xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 112-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vpsrldq $8,%xmm7,%xmm7 subq $0x10,%rcx jz .Ltail_avx vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vmovdqu -112(%rdx),%xmm14 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vmovdqu 144-64(%rsi),%xmm6 vpshufb %xmm13,%xmm14,%xmm15 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovq 184-64(%rsi),%xmm7 subq $0x10,%rcx jmp .Ltail_avx .align 32 /* tail of gcm_ghash_avx: XOR the running hash (%xmm10) into the last block, finish the multiply-accumulate, reduce with the constant loaded from (%r10), and go back to .Lshort_avx while %rcx is non-zero; otherwise byte-swap and store the hash at (%rdi) */ .Ltail_avx: vpxor %xmm10,%xmm15,%xmm15 .Ltail_no_xor_avx: vpunpckhqdq %xmm15,%xmm15,%xmm8 vpxor %xmm0,%xmm3,%xmm3 vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 vpxor %xmm15,%xmm8,%xmm8 vpxor %xmm1,%xmm4,%xmm4 vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 vpxor %xmm2,%xmm5,%xmm5 vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 vmovdqu (%r10),%xmm12 vpxor %xmm0,%xmm3,%xmm10 vpxor %xmm1,%xmm4,%xmm11 vpxor %xmm2,%xmm5,%xmm5 vpxor %xmm10,%xmm5,%xmm5 vpxor %xmm11,%xmm5,%xmm5 vpslldq $8,%xmm5,%xmm9 vpsrldq $8,%xmm5,%xmm5 vpxor %xmm9,%xmm10,%xmm10 vpxor %xmm5,%xmm11,%xmm11 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 vpalignr $8,%xmm10,%xmm10,%xmm10 vpxor %xmm11,%xmm10,%xmm10 vpxor %xmm9,%xmm10,%xmm10 cmpq $0,%rcx jne .Lshort_avx vpshufb %xmm13,%xmm10,%xmm10 vmovdqu %xmm10,(%rdi) vzeroupper RET .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx + +.pushsection .rodata .align 64 /* read-only constants; the instructions visible in this hunk reference only .Lbswap_mask and .L0x1c2_polynomial - NOTE(review): .L7_mask, .L7_mask_poly, .Lrem_4bit and .Lrem_8bit appear to be unused here, confirm before removing */ .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,450,0 .align 64 .type .Lrem_4bit,@object .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 .type
.Lrem_8bit,@object .Lrem_8bit: /* 256 16-bit entries (32 rows of 8); not referenced by any instruction visible in this hunk */ .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value
0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE /* NUL-terminated ASCII string "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>" */ .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.popsection /* Mark the stack non-executable. */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */